一、加载数据¶

加载数据(train+test)¶

In [ ]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
# from sklearn.linear_model import LogisticR2egression
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn import tree
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import seaborn as sns
# 解决中文乱码问题
import matplotlib
import matplotlib.font_manager as font_manager

matplotlib.rcParams['font.sans-serif'] = ['FZSongYi-Z13S'] # 指定默认字体

matplotlib.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题

#显示全部特征
pd.set_option('display.max_columns', None)
# 读取训练集和测试集
train_data = pd.read_csv('./userlostprob_train.txt', sep='\t')
test_data = pd.read_csv('./userlostprob_test.txt', sep='\t')

# 查看数据结构
print(train_data.head())
print(test_data.head())
   label  sampleid           d     arrival  iforderpv_24h  decisionhabit_user  \
0      0     24636  2016-05-18  2016-05-18              0                 NaN   
1      1     24637  2016-05-18  2016-05-18              0                 NaN   
2      0     24641  2016-05-18  2016-05-19              0                 NaN   
3      0     24642  2016-05-18  2016-05-18              0                 NaN   
4      1     24644  2016-05-18  2016-05-19              0                 NaN   

   historyvisit_7ordernum  historyvisit_totalordernum  hotelcr  \
0                     NaN                         NaN     1.04   
1                     NaN                         NaN     1.06   
2                     NaN                         NaN     1.05   
3                     NaN                         NaN     1.01   
4                     NaN                         NaN     1.00   

   ordercanceledprecent  landhalfhours  ordercanncelednum  commentnums  \
0                   NaN           22.0                NaN       1089.0   
1                   NaN            0.0                NaN       5612.0   
2                   NaN            3.0                NaN        256.0   
3                   NaN            2.0                NaN          NaN   
4                   NaN            0.0                NaN          NaN   

   starprefer  novoters  consuming_capacity  historyvisit_avghotelnum  \
0         NaN    1933.0                 NaN                       NaN   
1         NaN    6852.0                 NaN                       NaN   
2         NaN     367.0                 NaN                       NaN   
3         NaN       NaN                 NaN                       NaN   
4         NaN       NaN                 NaN                       NaN   

   cancelrate  historyvisit_visit_detailpagenum  delta_price1  \
0      1261.0                               NaN           NaN   
1      3205.0                               NaN           NaN   
2       194.0                               NaN           NaN   
3         3.0                               NaN           NaN   
4         NaN                               NaN           NaN   

   price_sensitive  hoteluv  businessrate_pre  ordernum_oneyear  cr_pre  \
0              NaN  102.607              0.25               NaN    1.03   
1              NaN  278.373              0.51               NaN    1.07   
2              NaN   16.133              0.61               NaN    1.12   
3              NaN    1.780               NaN               NaN    1.01   
4              NaN    0.073               NaN               NaN    1.03   

   avgprice  lowestprice  firstorder_bu  customereval_pre2  delta_price2  \
0       NaN         49.0            NaN                3.2           NaN   
1       NaN        619.0            NaN                4.9           NaN   
2       NaN        312.0            NaN                3.9           NaN   
3       NaN        198.0            NaN                2.1           NaN   
4       NaN          NaN            NaN                1.5           NaN   

   commentnums_pre  customer_value_profit  commentnums_pre2  cancelrate_pre  \
0            724.0                    NaN             844.0            0.03   
1           5610.0                    NaN            3789.0            0.21   
2           4721.0                    NaN            4341.0            0.52   
3             41.0                    NaN             529.0            0.53   
4              NaN                    NaN               NaN            1.00   

   novoters_pre2  novoters_pre  ctrip_profits  deltaprice_pre2_t1  \
0         1335.0        1249.0            NaN                29.0   
1         5430.0        7829.0            NaN               -56.0   
2         5353.0        7324.0            NaN                 8.0   
3         1004.0          81.0            NaN                -7.0   
4            1.0           NaN            NaN                -5.0   

   lowestprice_pre   uv_pre  uv_pre2  lowestprice_pre2  lasthtlordergap  \
0             46.0   58.027   74.956             615.0              NaN   
1            111.0  249.347  224.920             513.0              NaN   
2            413.0  133.093  112.063             382.0              NaN   
3            188.0    4.600   58.844             203.0              NaN   
4              NaN    0.213    0.157              84.0              NaN   

   businessrate_pre2  cityuvs  cityorders  lastpvgap  cr  sid  \
0               0.29   12.880       3.147        NaN NaN    7   
1               0.53   17.933       4.913        NaN NaN   33   
2               0.60    3.993       0.760        NaN NaN   10   
3               0.18    3.220       0.660        NaN NaN    8   
4                NaN    0.013         NaN        NaN NaN    1   

   visitnum_oneyear   h  
0               NaN  12  
1               NaN  14  
2               NaN  19  
3               NaN  16  
4               NaN  21  
   sampleid           d     arrival  iforderpv_24h  decisionhabit_user  \
0         2  2016-05-22  2016-05-23              0                 4.0   
1         7  2016-05-22  2016-06-15              0                 7.0   
2        14  2016-05-22  2016-05-22              0                 NaN   
3        19  2016-05-22  2016-05-22              0                 1.0   
4        20  2016-05-22  2016-05-22              0                 4.0   

   historyvisit_7ordernum  historyvisit_totalordernum  hotelcr  \
0                     NaN                         4.0     1.03   
1                     NaN                         NaN     1.03   
2                     NaN                         NaN     1.02   
3                     NaN                         5.0     1.01   
4                     NaN                         9.0     1.06   

   ordercanceledprecent  landhalfhours  ordercanncelednum  commentnums  \
0                  0.00            0.0                0.0       3866.0   
1                   NaN            1.0                NaN       1377.0   
2                   NaN            0.0                NaN      11846.0   
3                  0.20           21.0                1.0        242.0   
4                  0.27            2.0                7.0        453.0   

   starprefer  novoters  consuming_capacity  historyvisit_avghotelnum  \
0        96.7    5137.0                63.0                       3.3   
1         NaN    1754.0                 NaN                       7.0   
2         NaN   14931.0                 NaN                       NaN   
3        70.0     329.0                40.0                       1.0   
4        40.0     602.0                 9.0                       5.2   

   cancelrate  historyvisit_visit_detailpagenum  delta_price1  \
0      2191.0                               7.0         167.0   
1      1284.0                              13.0           NaN   
2      6110.0                               NaN           NaN   
3       378.0                               2.0         157.0   
4       174.0                              19.0          17.0   

   price_sensitive   hoteluv  businessrate_pre  ordernum_oneyear  cr_pre  \
0              9.0   300.747              0.52               4.0    1.05   
1              NaN   243.720              0.67               NaN    1.06   
2              NaN  1547.253              0.04               NaN    1.01   
3             24.0   139.827               NaN               5.0    1.03   
4              2.0    12.940              0.01               9.0    1.06   

   avgprice  lowestprice  firstorder_bu  customereval_pre2  delta_price2  \
0     635.0        723.0            NaN                5.0          79.0   
1       NaN        889.0            NaN                5.0           NaN   
2       NaN        722.0            NaN                4.8           NaN   
3     359.0        334.0            NaN                2.9          94.0   
4      97.0        118.0           13.0                2.0          17.0   

   commentnums_pre  customer_value_profit  commentnums_pre2  cancelrate_pre  \
0           1161.0                  3.230            1352.0            0.18   
1           1940.0                    NaN            2767.0            0.29   
2           2089.0                    NaN            5992.0            0.12   
3              2.0                  2.466             220.0            0.36   
4            172.0                 -0.016             102.0            0.21   

   novoters_pre2  novoters_pre  ctrip_profits  deltaprice_pre2_t1  \
0         2146.0        1612.0          3.227               -13.0   
1         4087.0        2689.0          2.853                 NaN   
2         6650.0        3263.0            NaN                 0.0   
3          324.0          10.0          2.460               -14.0   
4          170.0         297.0            NaN                 7.0   

   lowestprice_pre    uv_pre   uv_pre2  lowestprice_pre2  lasthtlordergap  \
0            468.0    51.593   197.800             556.0         149965.0   
1           1090.0   216.500   168.276             833.0              NaN   
2            623.0  1081.507  1136.691             640.0              NaN   
3            202.0     2.200    59.645             265.0         116831.0   
4             80.0     3.827     2.993              79.0           3554.0   

   businessrate_pre2  cityuvs  cityorders  lastpvgap    cr  sid  \
0               0.31    1.773       0.153      204.0  1.00   46   
1               0.60    0.993       0.093     7364.0  1.00   77   
2               0.03   13.067       2.227        NaN   NaN   54   
3               0.05    3.247       0.173     4689.0  1.00   56   
4               0.04    8.747       1.960     2026.0  1.89   75   

   visitnum_oneyear   h  
0            1545.0  22  
1            1084.0  21  
2               NaN   1  
3             336.0   9  
4            1416.0  16  
In [ ]:
def isSame(train_data,test_data):
    # 找出存在于一个DataFrame中但不在另一个中的列
    unique_to_train = train_data.columns.difference(test_data.columns)
    unique_to_test = test_data.columns.difference(train_data.columns)

    # 输出结果
    if unique_to_train.empty and unique_to_test.empty:
        print("字段完全一样。")
    else:
        print("字段不完全一样。")
        print(f"在train_data中独有的字段: {unique_to_train}")
        print(f"在test_data中独有的字段: {unique_to_test}")
isSame(train_data,test_data)
字段不完全一样。
在train_data中独有的字段: Index(['label'], dtype='object')
在test_data中独有的字段: Index([], dtype='object')

查询数据信息¶

基本信息(字段+非Null+类型)¶

In [ ]:
# Column   Non-Null Count   Dtype  
#查看数据信息
train_data.info()
#查看数据信息
test_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689945 entries, 0 to 689944
Data columns (total 51 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   label                             689945 non-null  int64  
 1   sampleid                          689945 non-null  int64  
 2   d                                 689945 non-null  object 
 3   arrival                           689945 non-null  object 
 4   iforderpv_24h                     689945 non-null  int64  
 5   decisionhabit_user                385450 non-null  float64
 6   historyvisit_7ordernum            82915 non-null   float64
 7   historyvisit_totalordernum        386525 non-null  float64
 8   hotelcr                           689148 non-null  float64
 9   ordercanceledprecent              447831 non-null  float64
 10  landhalfhours                     661312 non-null  float64
 11  ordercanncelednum                 447831 non-null  float64
 12  commentnums                       622029 non-null  float64
 13  starprefer                        464892 non-null  float64
 14  novoters                          672918 non-null  float64
 15  consuming_capacity                463837 non-null  float64
 16  historyvisit_avghotelnum          387876 non-null  float64
 17  cancelrate                        678227 non-null  float64
 18  historyvisit_visit_detailpagenum  307234 non-null  float64
 19  delta_price1                      437146 non-null  float64
 20  price_sensitive                   463837 non-null  float64
 21  hoteluv                           689148 non-null  float64
 22  businessrate_pre                  483896 non-null  float64
 23  ordernum_oneyear                  447831 non-null  float64
 24  cr_pre                            660548 non-null  float64
 25  avgprice                          457261 non-null  float64
 26  lowestprice                       687931 non-null  float64
 27  firstorder_bu                     376993 non-null  float64
 28  customereval_pre2                 661312 non-null  float64
 29  delta_price2                      437750 non-null  float64
 30  commentnums_pre                   598368 non-null  float64
 31  customer_value_profit             439123 non-null  float64
 32  commentnums_pre2                  648457 non-null  float64
 33  cancelrate_pre                    653015 non-null  float64
 34  novoters_pre2                     657616 non-null  float64
 35  novoters_pre                      648956 non-null  float64
 36  ctrip_profits                     445187 non-null  float64
 37  deltaprice_pre2_t1                543180 non-null  float64
 38  lowestprice_pre                   659689 non-null  float64
 39  uv_pre                            660548 non-null  float64
 40  uv_pre2                           661189 non-null  float64
 41  lowestprice_pre2                  660664 non-null  float64
 42  lasthtlordergap                   447831 non-null  float64
 43  businessrate_pre2                 602960 non-null  float64
 44  cityuvs                           682274 non-null  float64
 45  cityorders                        651263 non-null  float64
 46  lastpvgap                         592818 non-null  float64
 47  cr                                457896 non-null  float64
 48  sid                               689945 non-null  int64  
 49  visitnum_oneyear                  592910 non-null  float64
 50  h                                 689945 non-null  int64  
dtypes: float64(44), int64(5), object(2)
memory usage: 268.5+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435075 entries, 0 to 435074
Data columns (total 50 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   sampleid                          435075 non-null  int64  
 1   d                                 435075 non-null  object 
 2   arrival                           435075 non-null  object 
 3   iforderpv_24h                     435075 non-null  int64  
 4   decisionhabit_user                280899 non-null  float64
 5   historyvisit_7ordernum            38543 non-null   float64
 6   historyvisit_totalordernum        270733 non-null  float64
 7   hotelcr                           433263 non-null  float64
 8   ordercanceledprecent              136708 non-null  float64
 9   landhalfhours                     416027 non-null  float64
 10  ordercanncelednum                 136708 non-null  float64
 11  commentnums                       389702 non-null  float64
 12  starprefer                        140768 non-null  float64
 13  novoters                          424681 non-null  float64
 14  consuming_capacity                140516 non-null  float64
 15  historyvisit_avghotelnum          280899 non-null  float64
 16  cancelrate                        426018 non-null  float64
 17  historyvisit_visit_detailpagenum  185397 non-null  float64
 18  delta_price1                      131464 non-null  float64
 19  price_sensitive                   140534 non-null  float64
 20  hoteluv                           433263 non-null  float64
 21  businessrate_pre                  301030 non-null  float64
 22  ordernum_oneyear                  136708 non-null  float64
 23  cr_pre                            414281 non-null  float64
 24  avgprice                          138804 non-null  float64
 25  lowestprice                       433705 non-null  float64
 26  firstorder_bu                     107166 non-null  float64
 27  customereval_pre2                 416027 non-null  float64
 28  delta_price2                      131612 non-null  float64
 29  commentnums_pre                   374104 non-null  float64
 30  customer_value_profit             134568 non-null  float64
 31  commentnums_pre2                  407392 non-null  float64
 32  cancelrate_pre                    409237 non-null  float64
 33  novoters_pre2                     413810 non-null  float64
 34  novoters_pre                      408418 non-null  float64
 35  ctrip_profits                     134329 non-null  float64
 36  deltaprice_pre2_t1                327667 non-null  float64
 37  lowestprice_pre                   415073 non-null  float64
 38  uv_pre                            414281 non-null  float64
 39  uv_pre2                           415713 non-null  float64
 40  lowestprice_pre2                  415560 non-null  float64
 41  lasthtlordergap                   136708 non-null  float64
 42  businessrate_pre2                 378710 non-null  float64
 43  cityuvs                           429798 non-null  float64
 44  cityorders                        359043 non-null  float64
 45  lastpvgap                         167631 non-null  float64
 46  cr                                314164 non-null  float64
 47  sid                               435075 non-null  int64  
 48  visitnum_oneyear                  167666 non-null  float64
 49  h                                 435075 non-null  int64  
dtypes: float64(44), int64(4), object(2)
memory usage: 166.0+ MB

label分布(1+0)¶

In [ ]:
# label分布
train_data.label.value_counts()
Out[ ]:
label
0    500588
1    189357
Name: count, dtype: int64

查看数据形状(行+字段)¶

In [ ]:
train_data.shape  # (689945, 51)
# test_data.shape  # (435075, 50)
Out[ ]:
(689945, 51)

客户流失比率¶

In [ ]:
print('客户流失比率:{0:.2%}'.format(train_data['label'].value_counts()[1]/sum(train_data['label'].value_counts())))
客户流失比率:27.45%

二、清洗数据¶

copy 原始数据(便于后续增删改查)¶

In [ ]:
train_data_rawdf = train_data.copy()
test_data_rawdf = test_data.copy()
In [ ]:
def drop_date(rawdf):
 
    if 'day_advanced' in rawdf.columns:
        return
    rawdf['arrival'] = pd.to_datetime(rawdf['arrival'] )
    rawdf['d'] = pd.to_datetime(rawdf['d'])
    # 生成提前预定天数(衍生变量)(到达日期-访问日期间隔)(看提前多少天订)
    rawdf['day_advanced'] = (rawdf['arrival']-rawdf['d']).dt.days
    # 时间格式
    rawdf['d'] = pd.to_datetime(rawdf['d'], format = '%Y-%m-%d')
    rawdf['arrival'] = pd.to_datetime(rawdf['arrival'], format='%Y-%m-%d')
    # 用户周几入住
    rawdf['arrival_weekday'] = rawdf['arrival'].map(lambda x:x.weekday())
    # 用户入住那天是否为休息日
    def is_weekend(a):
        if int(a) in [0,1,2,3,4]:
            return 0 # 0代表是工作日
        else:
            return 1 # 1代表是周末
    rawdf['is_arrival_weekend'] = rawdf['arrival_weekday'].map(lambda x: is_weekend(x))
    rawdf.drop(labels=['d','arrival'], axis=1, inplace=True)
drop_date(train_data_rawdf)
drop_date(test_data_rawdf)
In [ ]:
isSame(train_data_rawdf,test_data_rawdf)
字段不完全一样。
在train_data中独有的字段: Index(['label'], dtype='object')
在test_data中独有的字段: Index([], dtype='object')

查看均值方差(describe)¶

In [ ]:
desc_stats = train_data_rawdf.describe().T

# 计算偏度和峰度
desc_stats['skew'] = train_data_rawdf.skew()
desc_stats['kurt'] = train_data_rawdf.kurt()


# test_data_rawdf['skew'] = test_data_rawdf.skew()
# test_data_rawdf['kurt'] = test_data_rawdf.kurt()

# test_data_rawdf.describe().T
desc_stats.head(10)
# count: 
#   字段缺失,特征列存在不同程度的缺失情况,因此部分特征列的count数不全为689945,
#   如historyvisit_7ordernum仅有82915条,存在数据缺失。后面进行缺失值填充的时候要注意分布的形态。
# 负数:
#   不应为负的数据特征列存在负值的情况,
#   如:delta_price1(用户偏好价格-24小时浏览最多酒店价格)、lowestprice、delta_price2、customer_value_profit(客户近一年的价值),这些负值属于异常情况,后面需要对负值进行处理
# 方差:
#   数据特征列存在极值情况,方差很大,这样的数据需要对其极值进行处理。
Out[ ]:
count mean std min 25% 50% 75% max skew kurt
label 689945.0 0.274452 0.446238 0.0 0.00 0.00 1.00 1.00 1.010888 -0.978109
sampleid 689945.0 628540.209625 414681.498697 24636.0 312320.00 599637.00 887460.00 2238426.00 1.342328 3.576609
iforderpv_24h 689945.0 0.193737 0.395226 0.0 0.00 0.00 0.00 1.00 1.549817 0.401934
decisionhabit_user 385450.0 5.317048 38.524483 0.0 2.00 3.00 5.00 3167.00 50.595261 2685.315258
historyvisit_7ordernum 82915.0 1.856094 2.103862 1.0 1.00 1.00 2.00 106.00 28.261263 1347.099159
historyvisit_totalordernum 386525.0 11.710487 17.251429 1.0 2.00 6.00 14.00 711.00 4.656652 47.507033
hotelcr 689148.0 1.060996 0.045264 1.0 1.03 1.05 1.09 3.18 2.830497 68.146802
ordercanceledprecent 447831.0 0.342119 0.354210 0.0 0.00 0.25 0.57 1.00 0.655980 -0.929859
landhalfhours 661312.0 6.086366 12.413225 0.0 0.00 0.00 4.00 49.00 2.178247 3.569327
ordercanncelednum 447831.0 154.179369 398.456986 0.0 0.00 2.00 153.00 13475.00 7.527943 127.526808

查看数据缺失率(bar)¶

In [ ]:
train_data_rawdf.isnull().mean()
Out[ ]:
label                               0.000000
sampleid                            0.000000
iforderpv_24h                       0.000000
decisionhabit_user                  0.441332
historyvisit_7ordernum              0.879824
historyvisit_totalordernum          0.439774
hotelcr                             0.001155
ordercanceledprecent                0.350918
landhalfhours                       0.041500
ordercanncelednum                   0.350918
commentnums                         0.098437
starprefer                          0.326190
novoters                            0.024679
consuming_capacity                  0.327719
historyvisit_avghotelnum            0.437816
cancelrate                          0.016984
historyvisit_visit_detailpagenum    0.554698
delta_price1                        0.366405
price_sensitive                     0.327719
hoteluv                             0.001155
businessrate_pre                    0.298646
ordernum_oneyear                    0.350918
cr_pre                              0.042608
avgprice                            0.337250
lowestprice                         0.002919
firstorder_bu                       0.453590
customereval_pre2                   0.041500
delta_price2                        0.365529
commentnums_pre                     0.132731
customer_value_profit               0.363539
commentnums_pre2                    0.060132
cancelrate_pre                      0.053526
novoters_pre2                       0.046857
novoters_pre                        0.059409
ctrip_profits                       0.354750
deltaprice_pre2_t1                  0.212720
lowestprice_pre                     0.043853
uv_pre                              0.042608
uv_pre2                             0.041679
lowestprice_pre2                    0.042440
lasthtlordergap                     0.350918
businessrate_pre2                   0.126075
cityuvs                             0.011118
cityorders                          0.056065
lastpvgap                           0.140775
cr                                  0.336330
sid                                 0.000000
visitnum_oneyear                    0.140642
h                                   0.000000
day_advanced                        0.000000
arrival_weekday                     0.000000
is_arrival_weekend                  0.000000
dtype: float64

查看缺失率(train)¶

In [ ]:
#查看缺失值比例
train_data_rawdf.isnull().mean().sort_values(ascending=False).plot(kind='bar', figsize=(20,10))
# 缺失:
#   看出字段缺失情况严重,其中historyvisit_7ordernum缺失值高达88%。除了arrival,d,h,sampleid,iforderpv_24h,sid,label外,
# 填充:
#   其余44列字段各有不同程度缺失。因此后面要根据缺失情况,结合数据特征分布,选用合适的方法填充缺失值。
Out[ ]:
<Axes: >
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
findfont: Generic family 'sans-serif' not found because none of the following families were found: FZSongYi-Z13S
No description has been provided for this image

查看缺失率(test)¶

In [ ]:
#查看缺失值比例
# train_data.isnull().mean().sort_values(ascending=False).plot(kind='bar', figsize=(20,10))
# 缺失:
#   看出字段缺失情况严重,其中historyvisit_7ordernum缺失值高达88%。除了arrival,d,h,sampleid,iforderpv_24h,sid,label外,
# 填充:
#   其余44列字段各有不同程度缺失。因此后面要根据缺失情况,结合数据特征分布,选用合适的方法填充缺失值。

def get_na_ratio(data):
    plt.rcParams['font.sans-serif']=['SimHei']
    data_count = data.count()
    na_count = len(data) - data_count
    na_rate = na_count/len(data)
    a = na_rate.sort_values(ascending=True) #按values正序排列,不放倒序是为了后边的图形展示排列
    a1 = pd.DataFrame(a)
   
    x = data.shape[1]
    fig = plt.figure(figsize=(8,12)) #图形大小
    plt.barh(range(x),a1[0],color='steelblue',alpha=1) 
    plt.xlabel('数据缺失占比') #添加轴标签
    columns1 = a1.index.values.tolist() #列名称
    plt.yticks(range(x),columns1)
    plt.xlim([0,1]) #设置X轴的刻度范围
    for x,y in enumerate(a1[0]):
        plt.text(y,x,'%.3f' %y,va='bottom')
    plt.show()
get_na_ratio(test_data_rawdf)  
No description has been provided for this image

查看数据分布偏态情况(skew)¶

In [ ]:
# train_data_rawdf.skew().sort_values()
train_data_rawdf.head()
Out[ ]:
label sampleid iforderpv_24h decisionhabit_user historyvisit_7ordernum historyvisit_totalordernum hotelcr ordercanceledprecent landhalfhours ordercanncelednum commentnums starprefer novoters consuming_capacity historyvisit_avghotelnum cancelrate historyvisit_visit_detailpagenum delta_price1 price_sensitive hoteluv businessrate_pre ordernum_oneyear cr_pre avgprice lowestprice firstorder_bu customereval_pre2 delta_price2 commentnums_pre customer_value_profit commentnums_pre2 cancelrate_pre novoters_pre2 novoters_pre ctrip_profits deltaprice_pre2_t1 lowestprice_pre uv_pre uv_pre2 lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h day_advanced arrival_weekday is_arrival_weekend
0 0 24636 0 NaN NaN NaN 1.04 NaN 22.0 NaN 1089.0 NaN 1933.0 NaN NaN 1261.0 NaN NaN NaN 102.607 0.25 NaN 1.03 NaN 49.0 NaN 3.2 NaN 724.0 NaN 844.0 0.03 1335.0 1249.0 NaN 29.0 46.0 58.027 74.956 615.0 NaN 0.29 12.880 3.147 NaN NaN 7 NaN 12 0 2 0
1 1 24637 0 NaN NaN NaN 1.06 NaN 0.0 NaN 5612.0 NaN 6852.0 NaN NaN 3205.0 NaN NaN NaN 278.373 0.51 NaN 1.07 NaN 619.0 NaN 4.9 NaN 5610.0 NaN 3789.0 0.21 5430.0 7829.0 NaN -56.0 111.0 249.347 224.920 513.0 NaN 0.53 17.933 4.913 NaN NaN 33 NaN 14 0 2 0
2 0 24641 0 NaN NaN NaN 1.05 NaN 3.0 NaN 256.0 NaN 367.0 NaN NaN 194.0 NaN NaN NaN 16.133 0.61 NaN 1.12 NaN 312.0 NaN 3.9 NaN 4721.0 NaN 4341.0 0.52 5353.0 7324.0 NaN 8.0 413.0 133.093 112.063 382.0 NaN 0.60 3.993 0.760 NaN NaN 10 NaN 19 1 3 0
3 0 24642 0 NaN NaN NaN 1.01 NaN 2.0 NaN NaN NaN NaN NaN NaN 3.0 NaN NaN NaN 1.780 NaN NaN 1.01 NaN 198.0 NaN 2.1 NaN 41.0 NaN 529.0 0.53 1004.0 81.0 NaN -7.0 188.0 4.600 58.844 203.0 NaN 0.18 3.220 0.660 NaN NaN 8 NaN 16 0 2 0
4 1 24644 0 NaN NaN NaN 1.00 NaN 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.073 NaN NaN 1.03 NaN NaN NaN 1.5 NaN NaN NaN NaN 1.00 1.0 NaN NaN -5.0 NaN 0.213 0.157 84.0 NaN NaN 0.013 NaN NaN NaN 1 NaN 21 1 3 0

数据分布偏态情况¶

In [ ]:
train_data_rawdf.skew().sort_values()
# =0:
#   当数据呈左右对称分布时,偏度系数等于0。
# >1||<-1:
#   偏度系数大于1或小于-1,视为严重偏斜分布;
# -0.5~-1 || 0.5~1:
#   偏度系数为0.5~1或-1~-0.5,视为中等偏斜分布;
# -0.5~0 || 0~0.5:
#   偏度系数为**-0.5~0或0~0.5**,视为轻微偏斜分布。
# 由上面可以看出,除了businessrate_pre2, businessrate_pre, customereval_pre2,其他数据基本都呈很大的偏态分布。

test_data_rawdf.skew().sort_values()
Out[ ]:
delta_price1                       -64.101019
delta_price2                       -18.164185
deltaprice_pre2_t1                  -3.027764
firstorder_bu                       -1.607174
sampleid                            -1.446678
h                                   -0.665304
starprefer                          -0.404704
arrival_weekday                     -0.186010
businessrate_pre2                   -0.071087
customereval_pre2                   -0.041711
businessrate_pre                    -0.011670
ordercanceledprecent                 0.649939
consuming_capacity                   0.931010
is_arrival_weekend                   0.965172
cancelrate_pre                       1.274742
price_sensitive                      1.523604
iforderpv_24h                        1.667979
lasthtlordergap                      1.778979
hotelcr                              1.785177
day_advanced                         1.821458
cr_pre                               1.913542
landhalfhours                        2.109167
cityuvs                              2.241206
cityorders                           2.413386
historyvisit_7ordernum               2.688576
avgprice                             3.060426
ordernum_oneyear                     3.475545
cancelrate                           3.801247
uv_pre2                              3.959311
uv_pre                               4.228314
historyvisit_totalordernum           4.235041
hoteluv                              4.445247
commentnums_pre2                     4.603914
novoters_pre2                        4.632928
lastpvgap                            5.034694
ordercanncelednum                    5.127708
novoters_pre                         5.332249
commentnums_pre                      5.352293
novoters                             5.499180
commentnums                          5.570767
ctrip_profits                        6.034621
customer_value_profit                6.391364
sid                                  7.280452
decisionhabit_user                   7.387129
historyvisit_avghotelnum             7.404542
historyvisit_visit_detailpagenum     9.843808
visitnum_oneyear                    17.849537
lowestprice_pre2                    20.223499
cr                                  24.179435
lowestprice_pre                     56.521439
lowestprice                         62.703681
dtype: float64
In [ ]:
# 查看数据分布图 
train_data_rawdf.hist(figsize=(20,20))
Out[ ]:
array([[<Axes: title={'center': 'label'}>,
        <Axes: title={'center': 'sampleid'}>,
        <Axes: title={'center': 'iforderpv_24h'}>,
        <Axes: title={'center': 'decisionhabit_user'}>,
        <Axes: title={'center': 'historyvisit_7ordernum'}>,
        <Axes: title={'center': 'historyvisit_totalordernum'}>,
        <Axes: title={'center': 'hotelcr'}>],
       [<Axes: title={'center': 'ordercanceledprecent'}>,
        <Axes: title={'center': 'landhalfhours'}>,
        <Axes: title={'center': 'ordercanncelednum'}>,
        <Axes: title={'center': 'commentnums'}>,
        <Axes: title={'center': 'starprefer'}>,
        <Axes: title={'center': 'novoters'}>,
        <Axes: title={'center': 'consuming_capacity'}>],
       [<Axes: title={'center': 'historyvisit_avghotelnum'}>,
        <Axes: title={'center': 'cancelrate'}>,
        <Axes: title={'center': 'historyvisit_visit_detailpagenum'}>,
        <Axes: title={'center': 'delta_price1'}>,
        <Axes: title={'center': 'price_sensitive'}>,
        <Axes: title={'center': 'hoteluv'}>,
        <Axes: title={'center': 'businessrate_pre'}>],
       [<Axes: title={'center': 'ordernum_oneyear'}>,
        <Axes: title={'center': 'cr_pre'}>,
        <Axes: title={'center': 'avgprice'}>,
        <Axes: title={'center': 'lowestprice'}>,
        <Axes: title={'center': 'firstorder_bu'}>,
        <Axes: title={'center': 'customereval_pre2'}>,
        <Axes: title={'center': 'delta_price2'}>],
       [<Axes: title={'center': 'commentnums_pre'}>,
        <Axes: title={'center': 'customer_value_profit'}>,
        <Axes: title={'center': 'commentnums_pre2'}>,
        <Axes: title={'center': 'cancelrate_pre'}>,
        <Axes: title={'center': 'novoters_pre2'}>,
        <Axes: title={'center': 'novoters_pre'}>,
        <Axes: title={'center': 'ctrip_profits'}>],
       [<Axes: title={'center': 'deltaprice_pre2_t1'}>,
        <Axes: title={'center': 'lowestprice_pre'}>,
        <Axes: title={'center': 'uv_pre'}>,
        <Axes: title={'center': 'uv_pre2'}>,
        <Axes: title={'center': 'lowestprice_pre2'}>,
        <Axes: title={'center': 'lasthtlordergap'}>,
        <Axes: title={'center': 'businessrate_pre2'}>],
       [<Axes: title={'center': 'cityuvs'}>,
        <Axes: title={'center': 'cityorders'}>,
        <Axes: title={'center': 'lastpvgap'}>,
        <Axes: title={'center': 'cr'}>, <Axes: title={'center': 'sid'}>,
        <Axes: title={'center': 'visitnum_oneyear'}>,
        <Axes: title={'center': 'h'}>],
       [<Axes: title={'center': 'day_advanced'}>,
        <Axes: title={'center': 'arrival_weekday'}>,
        <Axes: title={'center': 'is_arrival_weekend'}>, <Axes: >,
        <Axes: >, <Axes: >, <Axes: >]], dtype=object)
No description has been provided for this image

去除重复字段?¶

In [ ]:
train_data_rawdf.drop_duplicates(inplace=True)
train_data_rawdf.shape

test_data_rawdf.drop_duplicates(inplace=True)
test_data_rawdf.shape
Out[ ]:
(435075, 51)
In [ ]:
isSame(train_data_rawdf,test_data_rawdf)
字段不完全一样。
在train_data中独有的字段: Index(['label'], dtype='object')
在test_data中独有的字段: Index([], dtype='object')

缺失值删除(缺失值比例大于40%)¶

In [ ]:
# def delete_data(rawdf):
#     print('原来数据维度是:{}'.format(rawdf.shape))
#     def nan_drop(df, axi, rate=0.5):
#         thresh = df.shape[1-axi] * rate
#         df.dropna(axis=axi, thresh=thresh, inplace=True)
#     # 删除缺失值比例大于40%的列
#     nan_drop(rawdf, axi=1, rate=0.6)
#     print('删除缺失率较多的字段后的维度是:{}'.format(rawdf.shape))
# delete_data(train_data_rawdf)  

# delete_data(test_data_rawdf)   

def delete_data(rawdf):
    print('原来数据维度是:{}'.format(rawdf.shape))
    
    def nan_drop(df, axi, rate=0.5):
        original_columns = df.columns.tolist()  # 保存原始列名
        thresh = df.shape[1-axi] * rate
        df.dropna(axis=axi, thresh=thresh, inplace=True)
        # 找出被删除的列
        deleted_columns = [col for col in original_columns if col not in df.columns]
        return deleted_columns
    
    # 删除缺失值比例大于60%的列
    deleted_columns = nan_drop(rawdf, axi=1, rate=0.6)
    print('删除缺失率较多的字段后的维度是:{}'.format(rawdf.shape))
    print('被删除的字段有:{}'.format(deleted_columns))

# 假设 train_data_rawdf 是您的原始 DataFrame 对象
delete_data(train_data_rawdf)
原来数据维度是:(689945, 46)
删除缺失率较多的字段后的维度是:(689945, 46)
被删除的字段有:[]
In [ ]:
# 指定要删除的列名列表
columns_to_drop = [
    'decisionhabit_user', 'historyvisit_7ordernum', 
    'historyvisit_totalordernum', 'historyvisit_avghotelnum', 
    'historyvisit_visit_detailpagenum', 'firstorder_bu'
]

# 删除指定的列
test_data_rawdf.drop(columns=columns_to_drop, axis=1, inplace=True)

# 打印删除列后的DataFrame维度以确认列已被删除
print('删除指定字段后的维度是:{}'.format(test_data_rawdf.shape))
删除指定字段后的维度是:(435075, 45)
In [ ]:
isSame(train_data_rawdf,test_data_rawdf)
字段不完全一样。
在train_data中独有的字段: Index(['label'], dtype='object')
在test_data中独有的字段: Index([], dtype='object')

缺失值填充(对于缺失值小于80%)¶

查看含有缺数的数据的偏态¶

In [ ]:
train_data_rawdf.skew()[train_data_rawdf.isnull().mean(0)>0].sort_values()  
 # 对于缺失值小于80%的字段,结合数据分布形态填充。
 # 填充:
 #  服从正态分布的使用均值填充,
 #  呈偏态分布的,使用中位数填充。
 
# 由数据偏态信息可知,对starprefer、businessrate_pre2、businessrate_pre、customereval_pre2、
#  ordercanceledprecent、consuming_capacity、cancelrate_pre进行均值填充。 对其他缺失的列进行中位数填充。
# test_data_rawdf.skew()[train_data_rawdf.isnull().mean(0)>0].sort_values()  
Out[ ]:
delta_price1            -48.892476
delta_price2            -16.301581
starprefer               -0.361712
businessrate_pre2        -0.109048
businessrate_pre         -0.042611
customereval_pre2        -0.033801
ordercanceledprecent      0.655980
consuming_capacity        1.029861
cancelrate_pre            1.262474
deltaprice_pre2_t1        1.457223
price_sensitive           1.504168
lasthtlordergap           1.536367
cr_pre                    1.776279
cityuvs                   2.034204
cityorders                2.117058
landhalfhours             2.178247
avgprice                  2.700013
hotelcr                   2.830497
cancelrate                3.707977
lastpvgap                 3.862664
uv_pre2                   3.947333
uv_pre                    4.196402
cr                        4.483618
hoteluv                   4.504515
ordernum_oneyear          4.641911
commentnums_pre2          4.768733
novoters_pre2             4.777097
novoters_pre              5.220587
commentnums_pre           5.302130
novoters                  5.388156
commentnums               5.516973
ordercanncelednum         7.527943
ctrip_profits             9.856848
customer_value_profit    12.304766
lowestprice_pre2         21.554698
visitnum_oneyear         23.299890
lowestprice_pre          50.064034
lowestprice              78.040419
dtype: float64
In [ ]:
 #  服从正态分布的使用均值填充,
 #  呈偏态分布的,使用中位数填充。
def nan_fill(df):
    filter_mean = ["businessrate_pre2","cancelrate_pre","businessrate_pre",'starprefer',
    'customereval_pre2','ordercanceledprecent','consuming_capacity']
    for col in df.columns:
        if col in filter_mean:
            df[col] = df[col].fillna(df[col].mean())
        else:
            df[col] = df[col].fillna(df[col].median())
    return df
train_data_rawdf = nan_fill(train_data_rawdf)
test_data_rawdf = nan_fill(test_data_rawdf)

查看填充结果¶

In [ ]:
train_data_rawdf.skew()[train_data_rawdf.isnull().mean(0)>0].sort_values()  #0
test_data_rawdf.skew()[test_data_rawdf.isnull().mean(0)>0].sort_values()  #0
Out[ ]:
Series([], dtype: float64)

异常值-负数的处理¶

In [ ]:
# delta_price1(用户偏好价格-24h浏览最多酒店价格)
# delta_price2(用户偏好价格-24h浏览酒店平均价格)
# lowestprice(当前酒店可定最低价格)三者理论上酒店价格不可能为负,
# 填充:
#   并且由数据分布比较集中,因此负值采取中位数处理。
#   customer_value_profit(客户价值_近1年)、ctrip_profits(客户价值)也不应该为负值,
#   分布较为分散,因此将其填充为0

def filter_minus_data(rawdf):
    filter_one=['customer_value_profit','ctrip_profits']
    filter_two=['delta_price1','delta_price2','lowestprice']

    for f in filter_one:
        if f in rawdf:
            rawdf.loc[rawdf[f]<0, f] = 0
    for f in filter_two:
        if f in rawdf:
            rawdf.loc[rawdf[f]<0, f] = rawdf[f].median()
filter_minus_data(train_data_rawdf)
filter_minus_data(test_data_rawdf)

检验负数处理情况(查看最小值是否有负数)¶

In [ ]:
new_1= train_data_rawdf[['lowestprice']].describe()
new_1
# new_1.to_excel('new_1.xlsx')
# df_importance.to_excel('feature_importance.xlsx', index=False)
# train_data_rawdf.describe().T
Out[ ]:
lowestprice
count 689945.000000
mean 318.459732
std 574.977038
min 1.000000
25% 117.000000
50% 200.000000
75% 379.000000
max 100000.000000
In [ ]:
test_data_rawdf[['customer_value_profit','ctrip_profits','delta_price1','delta_price2','lowestprice']].describe()
Out[ ]:
customer_value_profit ctrip_profits delta_price1 delta_price2 lowestprice
count 435075.000000 435075.000000 435075.00000 435075.000000 435075.000000
mean 2.017994 2.702271 109.60285 92.344403 331.389202
std 4.031082 5.626372 136.10511 128.327708 602.492074
min 0.000000 0.000000 0.00000 0.000000 1.000000
25% 1.292000 1.693000 77.00000 62.000000 120.000000
50% 1.292000 1.693000 77.00000 62.000000 214.000000
75% 1.292000 1.693000 77.00000 62.000000 397.000000
max 167.220000 309.153000 6074.00000 6071.000000 100000.000000

极值处理-盖帽法¶

处理数据¶

In [ ]:
#盖帽法:某连续变量6西格玛之外的记录用正负3西格玛值替代,
# 一般正负3西格玛包含99%的数据,所以默认凡小于百分之一分位数和大于百分之九十九分位数的值用百分之一分位数和百分之九十九分位数代替,俗称盖帽法

# 盖帽法函数
def cap_values(series, lower_quantile=0.01, upper_quantile=0.99):
    q_low = series.quantile(lower_quantile)
    q_high = series.quantile(upper_quantile)
    return np.minimum(np.maximum(series, q_low), q_high)

# 应用盖帽法到DataFrame的每一列
def apply_cap_to_df(df, columns):
    for col in columns:
        df[col] = cap_values(df[col])
    return df

# 假设rawdf是你的DataFrame
# rawdf = pd.DataFrame(...)

# 需要处理的列名列表
# columns_to_cap = ['column1', 'column2', 'column3']  # 根据实际情况替换列名

# 应用盖帽法
train_data_rawdf = apply_cap_to_df(train_data_rawdf, train_data_rawdf.columns)
test_data_rawdf = apply_cap_to_df(test_data_rawdf, test_data_rawdf.columns)

# test_data_rawdf.head(10)
# def get_percentile_data(data1):
        
#     # 初始化一个空的DataFrame来存储结果
#     result_df = pd.DataFrame()

#     # 遍历data1中的每一列
#     for column in data1.columns:
#         # 计算第1百分位数和第99百分位数
#         p1 = np.percentile(data1[column], 1)
#         p99 = np.percentile(data1[column], 99)
        
#         # 找出小于第1百分位数的值
#         values_below_p1 = data1[data1[column] < p1]
        
#         # 找出大于第99百分位数的值
#         values_above_p99 = data1[data1[column] > p99]
        
#         # 将结果合并到result_df中
#         result_df = pd.concat([result_df, values_below_p1, values_above_p99])   
#     # 删除重复的行并重置索引
#     result_df = result_df.drop_duplicates().reset_index(drop=True)
#     return result_df
# ret = get_percentile_data(train_data_rawdf)

再次查看数据分布偏态情况(skew)¶

In [ ]:
# 查看表现
train_data_rawdf.skew().sort_values()
# test_data_rawdf.skew().sort_values()
Out[ ]:
h                       -0.665606
arrival_weekday         -0.186567
businessrate_pre2       -0.136561
starprefer              -0.126118
businessrate_pre        -0.072562
customereval_pre2       -0.043090
deltaprice_pre2_t1      -0.032096
ordercanceledprecent     0.814217
hotelcr                  0.829181
cr_pre                   0.841635
is_arrival_weekend       0.984591
label                    1.010888
consuming_capacity       1.279602
cancelrate_pre           1.327993
sampleid                 1.339944
iforderpv_24h            1.549817
novoters_pre2            1.982481
commentnums_pre2         1.998712
cityuvs                  2.045279
cancelrate               2.066955
avgprice                 2.076221
day_advanced             2.081788
price_sensitive          2.108583
lasthtlordergap          2.189324
lowestprice_pre2         2.210016
cityorders               2.212808
landhalfhours            2.243412
novoters                 2.263438
commentnums              2.354108
novoters_pre             2.444956
commentnums_pre          2.471846
lowestprice_pre          2.479717
lowestprice              2.596511
sid                      2.686568
cr                       2.752425
delta_price1             2.897290
delta_price2             3.022332
uv_pre2                  3.116940
ordernum_oneyear         3.132565
uv_pre                   3.287560
hoteluv                  3.397452
ordercanncelednum        3.649391
customer_value_profit    3.653897
ctrip_profits            3.680904
lastpvgap                3.773843
visitnum_oneyear         8.120348
dtype: float64
In [ ]:
train_data_rawdf.hist(figsize=(20,20)) 
# plt.savefig('./images/data_distribution_raw.png')
Out[ ]:
array([[<Axes: title={'center': 'label'}>,
        <Axes: title={'center': 'sampleid'}>,
        <Axes: title={'center': 'iforderpv_24h'}>,
        <Axes: title={'center': 'hotelcr'}>,
        <Axes: title={'center': 'ordercanceledprecent'}>,
        <Axes: title={'center': 'landhalfhours'}>,
        <Axes: title={'center': 'ordercanncelednum'}>],
       [<Axes: title={'center': 'commentnums'}>,
        <Axes: title={'center': 'starprefer'}>,
        <Axes: title={'center': 'novoters'}>,
        <Axes: title={'center': 'consuming_capacity'}>,
        <Axes: title={'center': 'cancelrate'}>,
        <Axes: title={'center': 'delta_price1'}>,
        <Axes: title={'center': 'price_sensitive'}>],
       [<Axes: title={'center': 'hoteluv'}>,
        <Axes: title={'center': 'businessrate_pre'}>,
        <Axes: title={'center': 'ordernum_oneyear'}>,
        <Axes: title={'center': 'cr_pre'}>,
        <Axes: title={'center': 'avgprice'}>,
        <Axes: title={'center': 'lowestprice'}>,
        <Axes: title={'center': 'customereval_pre2'}>],
       [<Axes: title={'center': 'delta_price2'}>,
        <Axes: title={'center': 'commentnums_pre'}>,
        <Axes: title={'center': 'customer_value_profit'}>,
        <Axes: title={'center': 'commentnums_pre2'}>,
        <Axes: title={'center': 'cancelrate_pre'}>,
        <Axes: title={'center': 'novoters_pre2'}>,
        <Axes: title={'center': 'novoters_pre'}>],
       [<Axes: title={'center': 'ctrip_profits'}>,
        <Axes: title={'center': 'deltaprice_pre2_t1'}>,
        <Axes: title={'center': 'lowestprice_pre'}>,
        <Axes: title={'center': 'uv_pre'}>,
        <Axes: title={'center': 'uv_pre2'}>,
        <Axes: title={'center': 'lowestprice_pre2'}>,
        <Axes: title={'center': 'lasthtlordergap'}>],
       [<Axes: title={'center': 'businessrate_pre2'}>,
        <Axes: title={'center': 'cityuvs'}>,
        <Axes: title={'center': 'cityorders'}>,
        <Axes: title={'center': 'lastpvgap'}>,
        <Axes: title={'center': 'cr'}>, <Axes: title={'center': 'sid'}>,
        <Axes: title={'center': 'visitnum_oneyear'}>],
       [<Axes: title={'center': 'h'}>,
        <Axes: title={'center': 'day_advanced'}>,
        <Axes: title={'center': 'arrival_weekday'}>,
        <Axes: title={'center': 'is_arrival_weekend'}>, <Axes: >,
        <Axes: >, <Axes: >]], dtype=object)
No description has been provided for this image

再次检查数据-箱型图(极值和负值)¶

In [ ]:
# def show_box(rawdf):
#     plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
#     plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号

#     # 遍历所有列
#     for column in rawdf.columns:
#         plt.figure(figsize=(4, 8), dpi=100)
#         plt.boxplot(rawdf[column].dropna().values)  # 使用列名来访问数据
#         plt.xlabel(column)
#         plt.show()

# # 假设train_data_rawdf是你的DataFrame
# show_box(train_data_rawdf)
# show_box(test_data_rawdf)

def show_box(rawdf):
    plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
    plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号

    # 计算需要多少行来放置所有的箱线图
    num_rows = len(rawdf.columns) // 2 + (len(rawdf.columns) % 2 > 0)  # 向上取整

    # 创建一个足够大的画布,每个子图的大小为4x8
    plt.figure(figsize=(10, num_rows * 4), dpi=100)

    # 遍历所有列,并在子图上绘制箱线图
    for i, column in enumerate(rawdf.columns):
        ax = plt.subplot(num_rows, 13, i + 1)  # 创建子图
        ax.boxplot(rawdf[column].dropna().values)  # 使用列名来访问数据
        ax.set_xlabel(column,rotation=90)  # 设置x轴标签
    # 旋转日期标签,使其竖直显示
    # plt.xticks(rotation=45)

    # 自动格式化x轴日期间隔
    # plt.gcf().autofmt_xdate()
    # 调整子图间距
    plt.tight_layout()
    plt.show()

# 假设train_data_rawdf和test_data_rawdf是你的DataFrame
show_box(train_data_rawdf)
# show_box(test_data_rawdf)
No description has been provided for this image

梳理列与列之间的关系¶

相关性¶

In [ ]:
# 查看数据分布图 
train_data_rawdf.hist(figsize=(20,20))
# plt.savefig('./images/data_distribution_raw.png')
Out[ ]:
array([[<Axes: title={'center': 'label'}>,
        <Axes: title={'center': 'sampleid'}>,
        <Axes: title={'center': 'iforderpv_24h'}>,
        <Axes: title={'center': 'hotelcr'}>,
        <Axes: title={'center': 'ordercanceledprecent'}>,
        <Axes: title={'center': 'landhalfhours'}>,
        <Axes: title={'center': 'ordercanncelednum'}>],
       [<Axes: title={'center': 'commentnums'}>,
        <Axes: title={'center': 'starprefer'}>,
        <Axes: title={'center': 'novoters'}>,
        <Axes: title={'center': 'consuming_capacity'}>,
        <Axes: title={'center': 'cancelrate'}>,
        <Axes: title={'center': 'delta_price1'}>,
        <Axes: title={'center': 'price_sensitive'}>],
       [<Axes: title={'center': 'hoteluv'}>,
        <Axes: title={'center': 'businessrate_pre'}>,
        <Axes: title={'center': 'ordernum_oneyear'}>,
        <Axes: title={'center': 'cr_pre'}>,
        <Axes: title={'center': 'avgprice'}>,
        <Axes: title={'center': 'lowestprice'}>,
        <Axes: title={'center': 'customereval_pre2'}>],
       [<Axes: title={'center': 'delta_price2'}>,
        <Axes: title={'center': 'commentnums_pre'}>,
        <Axes: title={'center': 'customer_value_profit'}>,
        <Axes: title={'center': 'commentnums_pre2'}>,
        <Axes: title={'center': 'cancelrate_pre'}>,
        <Axes: title={'center': 'novoters_pre2'}>,
        <Axes: title={'center': 'novoters_pre'}>],
       [<Axes: title={'center': 'ctrip_profits'}>,
        <Axes: title={'center': 'deltaprice_pre2_t1'}>,
        <Axes: title={'center': 'lowestprice_pre'}>,
        <Axes: title={'center': 'uv_pre'}>,
        <Axes: title={'center': 'uv_pre2'}>,
        <Axes: title={'center': 'lowestprice_pre2'}>,
        <Axes: title={'center': 'lasthtlordergap'}>],
       [<Axes: title={'center': 'businessrate_pre2'}>,
        <Axes: title={'center': 'cityuvs'}>,
        <Axes: title={'center': 'cityorders'}>,
        <Axes: title={'center': 'lastpvgap'}>,
        <Axes: title={'center': 'cr'}>, <Axes: title={'center': 'sid'}>,
        <Axes: title={'center': 'visitnum_oneyear'}>],
       [<Axes: title={'center': 'h'}>,
        <Axes: title={'center': 'day_advanced'}>,
        <Axes: title={'center': 'arrival_weekday'}>,
        <Axes: title={'center': 'is_arrival_weekend'}>, <Axes: >,
        <Axes: >, <Axes: >]], dtype=object)
No description has been provided for this image
In [ ]:
def corr_user1(rawdf):
    # missing_columns = [col for col in user_features if col not in rawdf.columns]
    # copy_user_features = user_features.copy()
    # copy_user_features = [col for col in copy_user_features if col not in missing_columns]
    # print(copy_user_features)
    mat = rawdf[rawdf.columns].corr()
    # 绘制用户特征的相关性矩阵热度图
    fig, ax = plt.subplots(figsize=(18*2,12*2))
    sns.heatmap(mat, xticklabels=True, yticklabels=True, square=False, linewidths=.5, annot=True, cmap="YlGnBu")

    plt.show()

    return mat
train_data_rawdf_mat = corr_user1(train_data_rawdf)
train_data_rawdf_mat
No description has been provided for this image
Out[ ]:
label sampleid iforderpv_24h hotelcr ordercanceledprecent landhalfhours ordercanncelednum commentnums starprefer novoters consuming_capacity cancelrate delta_price1 price_sensitive hoteluv businessrate_pre ordernum_oneyear cr_pre avgprice lowestprice customereval_pre2 delta_price2 commentnums_pre customer_value_profit commentnums_pre2 cancelrate_pre novoters_pre2 novoters_pre ctrip_profits deltaprice_pre2_t1 lowestprice_pre uv_pre uv_pre2 lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h day_advanced arrival_weekday is_arrival_weekend
label 1.000000 -0.000492 0.110308 0.121993 -0.005425 0.030844 0.110617 0.002726 -0.006543 0.006997 -0.024266 0.013711 0.026361 0.018370 -0.052476 0.114840 0.150235 0.119472 -0.016352 -0.065190 -0.027011 0.028846 0.000620 0.091545 -0.004073 0.018038 0.001309 0.005137 0.088149 0.007178 -0.055868 -0.055836 -0.063700 -0.067297 -0.058852 0.131459 0.101187 0.102336 0.010801 0.184888 0.016431 -0.049722 -0.077728 -0.153983 0.002958 -0.009220
sampleid -0.000492 1.000000 -0.000423 0.000542 -0.003193 0.001002 -0.000505 0.001947 -0.001072 0.002116 -0.000317 0.001320 -0.000902 0.001268 0.000178 0.003181 0.001463 0.000043 -0.000315 0.001775 0.001034 -0.000673 0.000614 0.002863 0.001695 -0.000915 0.002169 0.000832 0.001199 0.000446 0.001802 -0.000657 -0.000835 0.001387 -0.001727 0.001687 0.000799 0.000759 0.001157 0.000509 0.002542 0.000750 0.001812 0.000224 -0.001522 0.001464
iforderpv_24h 0.110308 -0.000423 1.000000 0.024071 0.012170 0.233447 -0.015206 0.013740 -0.025380 0.015740 -0.029280 0.012095 -0.022466 -0.022761 0.002599 0.009458 -0.014624 0.061656 -0.040083 -0.041892 -0.031826 -0.026489 0.044817 -0.029591 0.032517 -0.028610 0.036947 0.049872 -0.025420 0.006945 -0.037532 0.019053 0.009943 -0.044412 -0.017532 0.012048 0.039251 0.037020 -0.005508 -0.000848 -0.036166 0.073917 -0.012849 -0.011125 0.008014 0.007079
hotelcr 0.121993 0.000542 0.024071 1.000000 0.011143 -0.009319 0.070682 0.077973 -0.032843 0.111510 -0.096958 0.145374 -0.007506 -0.027991 -0.195894 0.393174 0.092868 0.455088 -0.087715 -0.212134 -0.018209 0.007050 -0.042322 0.008420 -0.057016 0.045860 -0.037198 -0.028255 0.011452 0.025691 -0.149879 -0.193813 -0.221810 -0.184937 -0.036398 0.462603 0.010997 0.031117 0.008732 0.104793 0.018775 0.012877 0.019573 -0.138218 -0.012310 -0.018519
ordercanceledprecent -0.005425 -0.003193 0.012170 0.011143 1.000000 -0.003579 0.325672 -0.016809 -0.069449 -0.017515 -0.064174 -0.018120 -0.072026 -0.033402 -0.020664 0.010344 0.082158 0.012103 -0.069562 -0.024391 -0.031568 -0.073073 -0.017312 -0.019029 -0.021717 0.027567 -0.022887 -0.017762 -0.016109 -0.010135 -0.025645 -0.020522 -0.022974 -0.028283 0.015938 0.013720 -0.001178 -0.000362 -0.000853 -0.134614 0.018306 0.006269 -0.000564 0.002098 0.005056 -0.001970
landhalfhours 0.030844 0.001002 0.233447 -0.009319 -0.003579 1.000000 0.011840 0.036527 0.007570 0.037715 0.002310 0.039000 0.010923 -0.000761 0.041235 -0.026266 0.024762 0.011789 0.003704 0.012501 0.036590 0.001920 0.059744 0.007725 0.056329 -0.047855 0.057026 0.061994 0.006446 0.008655 0.024001 0.057284 0.051673 0.030444 -0.015678 -0.028497 0.045452 0.043500 -0.054367 -0.011540 0.039231 0.108321 0.001964 -0.034362 0.008332 0.030208
ordercanncelednum 0.110617 -0.000505 -0.015206 0.070682 0.325672 0.011840 1.000000 0.021125 0.072213 0.022873 0.014271 0.024535 0.043845 0.007129 -0.008243 0.092955 0.697471 0.071068 0.033386 0.029601 0.075713 0.039229 0.016432 0.352043 0.020932 -0.015835 0.023322 0.017448 0.323278 0.007086 0.034699 -0.011653 -0.013259 0.038869 -0.077737 0.103838 0.023551 0.027659 -0.069852 0.181734 0.241964 0.043761 -0.003672 -0.067534 -0.021776 -0.018639
commentnums 0.002726 0.001947 0.013740 0.077973 -0.016809 0.036527 0.021125 1.000000 0.170337 0.986627 0.172716 0.837859 0.068394 0.028776 0.682082 -0.017608 0.031688 -0.048611 0.169607 0.224027 0.366761 0.054512 0.415256 0.111180 0.509763 -0.214569 0.511192 0.416398 0.100786 0.028964 0.220525 0.405871 0.454397 0.254302 0.014486 -0.005398 0.090860 0.070892 -0.017876 -0.008408 0.063053 0.005725 -0.002646 0.074934 -0.013678 -0.007720
starprefer -0.006543 -0.001072 -0.025380 -0.032843 -0.069449 0.007570 0.072213 0.170337 1.000000 0.171033 0.728197 0.182129 0.400740 0.038752 0.166661 0.017359 0.120942 -0.039899 0.666243 0.259878 0.356788 0.401197 0.160576 0.385153 0.203028 -0.156913 0.204954 0.160275 0.351759 0.041728 0.267120 0.160029 0.180710 0.308946 -0.052847 0.012737 0.018577 0.013694 -0.058683 -0.007503 0.136853 0.020643 -0.002809 0.031312 -0.014619 -0.010775
novoters 0.006997 0.002116 0.015740 0.111510 -0.017515 0.037715 0.022873 0.986627 0.171033 1.000000 0.171493 0.853580 0.068881 0.028533 0.678037 -0.004879 0.034418 -0.033836 0.168560 0.219171 0.369402 0.055179 0.417601 0.112363 0.512748 -0.217408 0.518597 0.422311 0.101811 0.033301 0.218488 0.403685 0.451455 0.251498 0.013645 0.009604 0.095031 0.075434 -0.017574 -0.005125 0.063840 0.007025 -0.002321 0.071222 -0.013623 -0.007789
consuming_capacity -0.024266 -0.000317 -0.029280 -0.096958 -0.064174 0.002310 0.014271 0.172716 0.728197 0.171493 1.000000 0.186699 0.589737 0.241250 0.195983 -0.035674 0.059060 -0.104799 0.899684 0.305614 0.333177 0.599844 0.163096 0.459208 0.207075 -0.152446 0.206418 0.161232 0.420644 0.062676 0.312567 0.189766 0.213273 0.362639 -0.000045 -0.046877 0.041321 0.031740 -0.043740 -0.060208 0.130227 0.017015 0.001819 0.083663 -0.003290 -0.005326
cancelrate 0.013711 0.001320 0.012095 0.145374 -0.018120 0.039000 0.024535 0.837859 0.182129 0.853580 0.186699 1.000000 0.075817 0.030782 0.756565 0.030727 0.037084 -0.022484 0.184900 0.246753 0.388280 0.060310 0.408483 0.124846 0.511110 -0.202260 0.515620 0.413909 0.115460 0.042032 0.239832 0.428912 0.480678 0.276399 0.013571 0.048090 0.151089 0.127710 -0.018667 -0.002253 0.069283 0.004450 -0.004663 0.067196 -0.014954 -0.008705
delta_price1 0.026361 -0.000902 -0.022466 -0.007506 -0.072026 0.010923 0.043845 0.068394 0.400740 0.068881 0.589737 0.075817 1.000000 0.217116 0.050365 0.019380 0.060427 -0.002964 0.707538 0.074178 0.108193 0.909763 0.071632 0.334857 0.089728 -0.057887 0.091642 0.072447 0.320884 0.044572 -0.095753 0.048194 0.058305 0.017120 0.077568 0.023504 0.041702 0.039969 -0.022239 0.040365 0.137399 0.019858 0.006350 0.009858 -0.011647 -0.012393
price_sensitive 0.018370 0.001268 -0.022761 -0.027991 -0.033402 -0.000761 0.007129 0.028776 0.038752 0.028533 0.241250 0.030782 0.217116 1.000000 0.032767 -0.015988 0.019737 -0.025215 0.245152 0.060616 0.058495 0.224172 0.029470 0.154500 0.034977 -0.027585 0.034775 0.029307 0.120446 0.016819 0.061849 0.030434 0.034278 0.073014 0.017887 -0.021591 0.020399 0.016766 -0.008366 0.056799 0.053669 0.020829 -0.000671 0.037552 0.005733 -0.000249
hoteluv -0.052476 0.000178 0.002599 -0.195894 -0.020664 0.041235 -0.008243 0.682082 0.166661 0.678037 0.195983 0.756565 0.050365 0.032767 1.000000 -0.206681 -0.006906 -0.201410 0.192592 0.372250 0.401654 0.032113 0.400708 0.094996 0.508152 -0.255675 0.497864 0.397479 0.084697 -0.011403 0.310592 0.611926 0.693862 0.364769 0.030016 -0.208543 0.059585 0.029337 -0.020363 -0.045512 0.051564 0.000132 -0.011013 0.136684 -0.013809 -0.002814
businessrate_pre 0.114840 0.003181 0.009458 0.393174 0.010344 -0.026266 0.092955 -0.017608 0.017359 -0.004879 -0.035674 0.030727 0.019380 -0.015988 -0.206681 1.000000 0.118832 0.521287 -0.023756 -0.081715 -0.018557 0.028578 -0.033132 0.063066 -0.043829 0.113483 -0.022617 -0.017423 0.066384 0.083459 -0.093702 -0.287193 -0.288367 -0.116378 -0.041595 0.839675 0.170056 0.183577 0.008094 0.118519 0.042884 0.010181 0.015272 -0.164622 -0.022992 -0.031647
ordernum_oneyear 0.150235 0.001463 -0.014624 0.092868 0.082158 0.024762 0.697471 0.031688 0.120942 0.034418 0.059060 0.037084 0.060427 0.019737 -0.006906 0.118832 1.000000 0.092990 0.074879 0.039091 0.099951 0.055956 0.024877 0.589498 0.032441 -0.028284 0.036702 0.026763 0.513736 0.017308 0.044471 -0.011691 -0.012170 0.049793 -0.257930 0.133643 0.030157 0.034742 -0.119503 0.282769 0.316929 0.070454 -0.007748 -0.081668 -0.039014 -0.029886
cr_pre 0.119472 0.000043 0.061656 0.455088 0.012103 0.011789 0.071068 -0.048611 -0.039899 -0.033836 -0.104799 -0.022484 -0.002964 -0.025215 -0.201410 0.521287 0.092990 1.000000 -0.094573 -0.157166 -0.005069 0.008205 0.051888 0.004601 -0.017631 -0.067697 0.013853 0.084736 0.008191 0.030638 -0.215729 -0.224403 -0.249726 -0.230065 -0.036883 0.531262 0.012648 0.033255 0.006104 0.104715 0.017455 0.020455 0.014137 -0.142703 -0.012612 -0.020682
avgprice -0.016352 -0.000315 -0.040083 -0.087715 -0.069562 0.003704 0.033386 0.169607 0.666243 0.168560 0.899684 0.184900 0.707538 0.245152 0.192592 -0.023756 0.074879 -0.094573 1.000000 0.314600 0.337313 0.724924 0.157997 0.490475 0.200693 -0.151754 0.200586 0.156171 0.456162 0.066985 0.321117 0.184727 0.207600 0.372497 0.030867 -0.032136 0.041208 0.031936 -0.045533 -0.023160 0.165819 0.026597 0.003963 0.083902 -0.006420 -0.010198
lowestprice -0.065190 0.001775 -0.041892 -0.212134 -0.024391 0.012501 0.029601 0.224027 0.259878 0.219171 0.305614 0.246753 0.074178 0.060616 0.372250 -0.081715 0.039091 -0.157166 0.314600 1.000000 0.418775 0.045705 0.197605 0.189463 0.255902 -0.176545 0.251468 0.194376 0.176076 0.044972 0.482142 0.293774 0.336927 0.565141 0.027427 -0.100605 0.042023 0.023651 -0.027053 -0.021063 0.109760 0.023946 -0.006463 0.130505 0.015271 0.011663
customereval_pre2 -0.027011 0.001034 -0.031826 -0.018209 -0.031568 0.036590 0.075713 0.366761 0.356788 0.369402 0.333177 0.388280 0.108193 0.058495 0.401654 -0.018557 0.099951 -0.005069 0.337313 0.418775 1.000000 0.081975 0.434567 0.238933 0.542253 -0.442958 0.545988 0.436519 0.220543 0.025923 0.516774 0.475066 0.533003 0.592722 0.012611 -0.019580 -0.055406 -0.067485 -0.049631 0.003828 0.147123 0.036845 -0.014246 0.107125 -0.022543 -0.011237
delta_price2 0.028846 -0.000673 -0.026489 0.007050 -0.073073 0.001920 0.039229 0.054512 0.401197 0.055179 0.599844 0.060310 0.909763 0.224172 0.032113 0.028578 0.055956 0.008205 0.724924 0.045705 0.081975 1.000000 0.057185 0.337208 0.068761 -0.052914 0.071564 0.058056 0.322430 0.034209 -0.027662 0.034070 0.035790 -0.031884 0.081158 0.032584 0.030479 0.031175 -0.017904 0.043167 0.132286 0.017452 0.007739 -0.000622 -0.012919 -0.012592
commentnums_pre 0.000620 0.000614 0.044817 -0.042322 -0.017312 0.059744 0.016432 0.415256 0.160576 0.417601 0.163096 0.408483 0.071632 0.029470 0.400708 -0.033132 0.024877 0.051888 0.157997 0.197605 0.434567 0.057185 1.000000 0.099698 0.821527 -0.310531 0.822742 0.986821 0.091443 0.038209 0.231631 0.684716 0.600708 0.261593 0.016245 -0.018595 0.081078 0.061015 -0.014918 -0.011920 0.054643 0.014873 0.001200 0.085242 -0.016114 -0.007082
customer_value_profit 0.091545 0.002863 -0.029591 0.008420 -0.019029 0.007725 0.352043 0.111180 0.385153 0.112363 0.459208 0.124846 0.334857 0.154500 0.094996 0.063066 0.589498 0.004601 0.490475 0.189463 0.238933 0.337208 0.099698 1.000000 0.126725 -0.093388 0.129934 0.100088 0.846496 0.051508 0.198176 0.089146 0.100517 0.227226 -0.140166 0.065567 0.046860 0.044189 -0.095766 0.183976 0.282234 0.060931 -0.004408 -0.004478 -0.026940 -0.024283
commentnums_pre2 -0.004073 0.001695 0.032517 -0.057016 -0.021717 0.056329 0.020932 0.509763 0.203028 0.512748 0.207075 0.511110 0.089728 0.034977 0.508152 -0.043829 0.032441 -0.017631 0.200693 0.255902 0.542253 0.068761 0.821527 0.126725 1.000000 -0.329677 0.981899 0.815104 0.115448 0.045098 0.287523 0.673500 0.739316 0.332534 0.019501 -0.025075 0.104632 0.078952 -0.021515 -0.014350 0.072572 0.012653 -0.003364 0.106090 -0.019686 -0.007454
cancelrate_pre 0.018038 -0.000915 -0.028610 0.045860 0.027567 -0.047855 -0.015835 -0.214569 -0.156913 -0.217408 -0.152446 -0.202260 -0.057887 -0.027585 -0.255675 0.113483 -0.028284 -0.067697 -0.151754 -0.176545 -0.442958 -0.052914 -0.310531 -0.093388 -0.329677 1.000000 -0.340087 -0.322329 -0.084365 -0.009206 -0.220220 -0.320180 -0.330733 -0.237741 -0.011567 0.144381 0.077926 0.088606 0.028619 0.013600 -0.062432 -0.024932 0.010556 -0.089451 0.013312 0.002962
novoters_pre2 0.001309 0.002169 0.036947 -0.037198 -0.022887 0.057026 0.023322 0.511192 0.204954 0.518597 0.206418 0.515620 0.091642 0.034775 0.497864 -0.022617 0.036702 0.013853 0.200586 0.251468 0.545988 0.071564 0.822742 0.129934 0.981899 -0.340087 1.000000 0.833242 0.118689 0.055052 0.282702 0.666136 0.727811 0.325686 0.018249 -0.000377 0.108625 0.083775 -0.020732 -0.009001 0.073381 0.015749 -0.002815 0.099393 -0.020253 -0.008146
novoters_pre 0.005137 0.000832 0.049872 -0.028255 -0.017762 0.061994 0.017448 0.416398 0.160275 0.422311 0.161232 0.413909 0.072447 0.029307 0.397479 -0.017423 0.026763 0.084736 0.156171 0.194376 0.436519 0.058056 0.986821 0.100088 0.815104 -0.322329 0.833242 1.000000 0.091892 0.043769 0.226831 0.681359 0.596813 0.256747 0.015645 -0.002446 0.085365 0.065592 -0.014033 -0.008970 0.054241 0.017952 0.001227 0.081802 -0.016665 -0.007869
ctrip_profits 0.088149 0.001199 -0.025420 0.011452 -0.016109 0.006446 0.323278 0.100786 0.351759 0.101811 0.420644 0.115460 0.320884 0.120446 0.084697 0.066384 0.513736 0.008191 0.456162 0.176076 0.220543 0.322430 0.091443 0.846496 0.115448 -0.084365 0.118689 0.091892 1.000000 0.052051 0.183182 0.079768 0.089452 0.211305 -0.096864 0.070251 0.052718 0.050835 -0.094942 0.159805 0.286329 0.065248 0.000656 -0.013352 -0.029033 -0.025471
deltaprice_pre2_t1 0.007178 0.000446 0.006945 0.025691 -0.010135 0.008655 0.007086 0.028964 0.041728 0.033301 0.062676 0.042032 0.044572 0.016819 -0.011403 0.083459 0.017308 0.030638 0.066985 0.044972 0.025923 0.034209 0.038209 0.051508 0.045098 -0.009206 0.055052 0.043769 0.052051 1.000000 0.071691 -0.011070 -0.016392 0.080952 -0.001456 0.090499 0.086132 0.085994 -0.000505 0.018554 0.022127 0.001160 0.001640 -0.008466 -0.004573 -0.004313
lowestprice_pre -0.055868 0.001802 -0.037532 -0.149879 -0.025645 0.024001 0.034699 0.220525 0.267120 0.218488 0.312567 0.239832 -0.095753 0.061849 0.310592 -0.093702 0.044471 -0.215729 0.321117 0.482142 0.516774 -0.027662 0.231631 0.198176 0.287523 -0.220220 0.282702 0.226831 0.183182 0.071691 1.000000 0.371156 0.391096 0.847433 0.028872 -0.112667 0.043150 0.024742 -0.024253 -0.016253 0.109393 0.016503 -0.005723 0.135596 0.012064 0.008757
uv_pre -0.055836 -0.000657 0.019053 -0.193813 -0.020522 0.057284 -0.011653 0.405871 0.160029 0.403685 0.189766 0.428912 0.048194 0.030434 0.611926 -0.287193 -0.011691 -0.224403 0.184727 0.293774 0.475066 0.034070 0.684716 0.089146 0.673500 -0.320180 0.666136 0.681359 0.079768 -0.011070 0.371156 1.000000 0.899233 0.403663 0.030990 -0.250915 0.047793 0.017647 -0.016644 -0.049527 0.045093 0.006259 -0.006515 0.149208 -0.018078 -0.004403
uv_pre2 -0.063700 -0.000835 0.009943 -0.221810 -0.022974 0.051673 -0.013259 0.454397 0.180710 0.451455 0.213273 0.480678 0.058305 0.034278 0.693862 -0.288367 -0.012170 -0.249726 0.207600 0.336927 0.533003 0.035790 0.600708 0.100517 0.739316 -0.330733 0.727811 0.596813 0.089452 -0.016392 0.391096 0.899233 1.000000 0.456633 0.034105 -0.282605 0.056401 0.021987 -0.020083 -0.053981 0.051593 0.002284 -0.012470 0.166380 -0.018840 -0.002906
lowestprice_pre2 -0.067297 0.001387 -0.044412 -0.184937 -0.028283 0.030444 0.038869 0.254302 0.308946 0.251498 0.362639 0.276399 0.017120 0.073014 0.364769 -0.116378 0.049793 -0.230065 0.372497 0.565141 0.592722 -0.031884 0.261593 0.227226 0.332534 -0.237741 0.325686 0.256747 0.211305 0.080952 0.847433 0.403663 0.456633 1.000000 0.031635 -0.142615 0.055594 0.032771 -0.030310 -0.020870 0.128235 0.021339 -0.005808 0.160922 0.014025 0.011842
lasthtlordergap -0.058852 -0.001727 -0.017532 -0.036398 0.015938 -0.015678 -0.077737 0.014486 -0.052847 0.013645 -0.000045 0.013571 0.077568 0.017887 0.030016 -0.041595 -0.257930 -0.036883 0.030867 0.027427 0.012611 0.081158 0.016245 -0.140166 0.019501 -0.011567 0.018249 0.015645 -0.096864 -0.001456 0.028872 0.030990 0.034105 0.031635 1.000000 -0.049963 -0.022230 -0.024010 0.155794 -0.278271 -0.003972 -0.036863 0.013465 0.077942 0.010573 0.003588
businessrate_pre2 0.131459 0.001687 0.012048 0.462603 0.013720 -0.028497 0.103838 -0.005398 0.012737 0.009604 -0.046877 0.048090 0.023504 -0.021591 -0.208543 0.839675 0.133643 0.531262 -0.032136 -0.100605 -0.019580 0.032584 -0.018595 0.065567 -0.025075 0.144381 -0.000377 -0.002446 0.070251 0.090499 -0.112667 -0.250915 -0.282605 -0.142615 -0.049963 1.000000 0.189061 0.205481 0.010175 0.138909 0.045270 0.011927 0.018820 -0.193663 -0.022434 -0.035654
cityuvs 0.101187 0.000799 0.039251 0.010997 -0.001178 0.045452 0.023551 0.090860 0.018577 0.095031 0.041321 0.151089 0.041702 0.020399 0.059585 0.170056 0.030157 0.012648 0.041208 0.042023 -0.055406 0.030479 0.081078 0.046860 0.104632 0.077926 0.108625 0.085365 0.052718 0.086132 0.043150 0.047793 0.056401 0.055594 -0.022230 0.189061 1.000000 0.987370 0.014811 0.059975 0.010518 -0.012771 -0.037693 -0.298827 0.013699 -0.037376
cityorders 0.102336 0.000759 0.037020 0.031117 -0.000362 0.043500 0.027659 0.070892 0.013694 0.075434 0.031740 0.127710 0.039969 0.016766 0.029337 0.183577 0.034742 0.033255 0.031936 0.023651 -0.067485 0.031175 0.061015 0.044189 0.078952 0.088606 0.083775 0.065592 0.050835 0.085994 0.024742 0.017647 0.021987 0.032771 -0.024010 0.205481 0.987370 1.000000 0.014391 0.066153 0.010385 -0.012186 -0.041568 -0.297744 -0.006654 -0.050069
lastpvgap 0.010801 0.001157 -0.005508 0.008732 -0.000853 -0.054367 -0.069852 -0.017876 -0.058683 -0.017574 -0.043740 -0.018667 -0.022239 -0.008366 -0.020363 0.008094 -0.119503 0.006104 -0.045533 -0.027053 -0.049631 -0.017904 -0.014918 -0.095766 -0.021515 0.028619 -0.020732 -0.014033 -0.094942 -0.000505 -0.024253 -0.016644 -0.020083 -0.030310 0.155794 0.010175 0.014811 0.014391 1.000000 -0.026634 -0.116780 -0.041993 0.017968 -0.013236 0.015911 0.001467
cr 0.184888 0.000509 -0.000848 0.104793 -0.134614 -0.011540 0.181734 -0.008408 -0.007503 -0.005125 -0.060208 -0.002253 0.040365 0.056799 -0.045512 0.118519 0.282769 0.104715 -0.023160 -0.021063 0.003828 0.043167 -0.011920 0.183976 -0.014350 0.013600 -0.009001 -0.008970 0.159805 0.018554 -0.016253 -0.049527 -0.053981 -0.020870 -0.278271 0.138909 0.059975 0.066153 -0.026634 1.000000 0.057134 -0.029451 0.013188 -0.128660 -0.018347 -0.017570
sid 0.016431 0.002542 -0.036166 0.018775 0.018306 0.039231 0.241964 0.063053 0.136853 0.063840 0.130227 0.069283 0.137399 0.053669 0.051564 0.042884 0.316929 0.017455 0.165819 0.109760 0.147123 0.132286 0.054643 0.282234 0.072572 -0.062432 0.073381 0.054241 0.286329 0.022127 0.109393 0.045093 0.051593 0.128235 -0.003972 0.045270 0.010518 0.010385 -0.116780 0.057134 1.000000 -0.000815 -0.001223 0.021620 -0.013090 -0.010283
visitnum_oneyear -0.049722 0.000750 0.073917 0.012877 0.006269 0.108321 0.043761 0.005725 0.020643 0.007025 0.017015 0.004450 0.019858 0.020829 0.000132 0.010181 0.070454 0.020455 0.026597 0.023946 0.036845 0.017452 0.014873 0.060931 0.012653 -0.024932 0.015749 0.017952 0.065248 0.001160 0.016503 0.006259 0.002284 0.021339 -0.036863 0.011927 -0.012771 -0.012186 -0.041993 -0.029451 -0.000815 1.000000 -0.013557 -0.030330 -0.006335 0.000296
h -0.077728 0.001812 -0.012849 0.019573 -0.000564 0.001964 -0.003672 -0.002646 -0.002809 -0.002321 0.001819 -0.004663 0.006350 -0.000671 -0.011013 0.015272 -0.007748 0.014137 0.003963 -0.006463 -0.014246 0.007739 0.001200 -0.004408 -0.003364 0.010556 -0.002815 0.001227 0.000656 0.001640 -0.005723 -0.006515 -0.012470 -0.005808 0.013465 0.018820 -0.037693 -0.041568 0.017968 0.013188 -0.001223 -0.013557 1.000000 0.032661 -0.003044 -0.007706
day_advanced -0.153983 0.000224 -0.011125 -0.138218 0.002098 -0.034362 -0.067534 0.074934 0.031312 0.071222 0.083663 0.067196 0.009858 0.037552 0.136684 -0.164622 -0.081668 -0.142703 0.083902 0.130505 0.107125 -0.000622 0.085242 -0.004478 0.106090 -0.089451 0.099393 0.081802 -0.013352 -0.008466 0.135596 0.149208 0.166380 0.160922 0.077942 -0.193663 -0.298827 -0.297744 -0.013236 -0.128660 0.021620 -0.030330 0.032661 1.000000 -0.003262 -0.040864
arrival_weekday 0.002958 -0.001522 0.008014 -0.012310 0.005056 0.008332 -0.021776 -0.013678 -0.014619 -0.013623 -0.003290 -0.014954 -0.011647 0.005733 -0.013809 -0.022992 -0.039014 -0.012612 -0.006420 0.015271 -0.022543 -0.012919 -0.016114 -0.026940 -0.019686 0.013312 -0.020253 -0.016665 -0.029033 -0.004573 0.012064 -0.018078 -0.018840 0.014025 0.010573 -0.022434 0.013699 -0.006654 0.015911 -0.018347 -0.013090 -0.006335 -0.003044 -0.003262 1.000000 0.748189
is_arrival_weekend -0.009220 0.001464 0.007079 -0.018519 -0.001970 0.030208 -0.018639 -0.007720 -0.010775 -0.007789 -0.005326 -0.008705 -0.012393 -0.000249 -0.002814 -0.031647 -0.029886 -0.020682 -0.010198 0.011663 -0.011237 -0.012592 -0.007082 -0.024283 -0.007454 0.002962 -0.008146 -0.007869 -0.025471 -0.004313 0.008757 -0.004403 -0.002906 0.011842 0.003588 -0.035654 -0.037376 -0.050069 0.001467 -0.017570 -0.010283 0.000296 -0.007706 -0.040864 0.748189 1.000000

对用户特征相关分析¶

用户特征提取¶

In [ ]:
user_features=['visitnum_oneyear','starprefer','sid','price_sensitive','ordernum_oneyear','ordercanncelednum','ordercanceledprecent','lastpvgap',
'lasthtlordergap','landhalfhours','iforderpv_24h','historyvisit_totalordernum','historyvisit_avghotelnum','h',
'delta_price2','delta_price1','decisionhabit_user','customer_value_profit','ctrip_profits','cr','consuming_capacity','avgprice']

生成用户特征的相关性矩阵+热度图¶

In [ ]:
# train_corr_mat = train_data_rawdf[user_features].corr()
# test_corr_mat = test_data_rawdf[user_features].corr()

def corr_user(rawdf):
    missing_columns = [col for col in user_features if col not in rawdf.columns]
    copy_user_features = user_features.copy()
    copy_user_features = [col for col in copy_user_features if col not in missing_columns]
    print(copy_user_features)
    mat = rawdf[copy_user_features].corr()
    # 绘制用户特征的相关性矩阵热度图
    fig, ax = plt.subplots(figsize=(18,12))
    sns.heatmap(mat, xticklabels=True, yticklabels=True, square=False, linewidths=.5, annot=True, cmap="YlGnBu")

    plt.show()

    return mat
train_data_rawdf_mat = corr_user(train_data_rawdf)
train_data_rawdf_mat
# test_data_rawdf_mat = corr_user(test_data_rawdf)
# test_data_rawdf_mat
# 从热图中看出:
# >0.85
#   delta_price1和delta_price2的相关性高达0.92,前者表示用户偏好价格-24小时浏览最多酒店价格,后者表示用户偏好价格-24小时浏览酒店平均价格,说明浏览24小时内浏览最多的酒店价格会影响到浏览酒店的平均价格,这可以理解为众数和平均数的关系。因此可以选择PCA提取一个主成分表示用户价格偏好。
#   ordernum_oneyear和historyvisit_totalordernum的相关性高达1.0,两者都是表示用户1年内订单数,特征选取时可以只选择其一,这里选择ordernum_oneyear作为用户年订单数的特征,也可以用PCA降维;
#   decisionhabit_user和historyvisit_avghotelnum的相关性达到了0.94,前者表示用户决策习惯,后者表示近三个月用户日均访问酒店数。说明决策时间久的用户近三个月访问酒店数的平均影响也越多,反过来也是,访问的酒店越多,该用户决策时间越久。
#   customer_value_profit和ctrip_profits之间的相关性达到了0.85,前者表示用户近一年的价值,后者也表示用户价值,细分区别在于衡量的时间长度不同,这里也选择PCA提取一个主成分表示用户价值。
#   consuming_capacity和avgprice之间的相关性达到了0.88,前者表示用户消费能力指数,后者表示酒店平均价格。很明显,消费能力越高,所选择的酒店平均价格大概率也越高。这里选择consuming_capacity来代表用户消费能力特征,也可以考虑用PCA降维综合这两个特征。
['visitnum_oneyear', 'starprefer', 'sid', 'price_sensitive', 'ordernum_oneyear', 'ordercanncelednum', 'ordercanceledprecent', 'lastpvgap', 'lasthtlordergap', 'landhalfhours', 'iforderpv_24h', 'h', 'delta_price2', 'delta_price1', 'customer_value_profit', 'ctrip_profits', 'cr', 'consuming_capacity', 'avgprice']
No description has been provided for this image
Out[ ]:
visitnum_oneyear starprefer sid price_sensitive ordernum_oneyear ordercanncelednum ordercanceledprecent lastpvgap lasthtlordergap landhalfhours iforderpv_24h h delta_price2 delta_price1 customer_value_profit ctrip_profits cr consuming_capacity avgprice
visitnum_oneyear 1.000000 0.020643 -0.000815 0.020829 0.070454 0.043761 0.006269 -0.041993 -0.036863 0.108321 0.073917 -0.013557 0.017452 0.019858 0.060931 0.065248 -0.029451 0.017015 0.026597
starprefer 0.020643 1.000000 0.136853 0.038752 0.120942 0.072213 -0.069449 -0.058683 -0.052847 0.007570 -0.025380 -0.002809 0.401197 0.400740 0.385153 0.351759 -0.007503 0.728197 0.666243
sid -0.000815 0.136853 1.000000 0.053669 0.316929 0.241964 0.018306 -0.116780 -0.003972 0.039231 -0.036166 -0.001223 0.132286 0.137399 0.282234 0.286329 0.057134 0.130227 0.165819
price_sensitive 0.020829 0.038752 0.053669 1.000000 0.019737 0.007129 -0.033402 -0.008366 0.017887 -0.000761 -0.022761 -0.000671 0.224172 0.217116 0.154500 0.120446 0.056799 0.241250 0.245152
ordernum_oneyear 0.070454 0.120942 0.316929 0.019737 1.000000 0.697471 0.082158 -0.119503 -0.257930 0.024762 -0.014624 -0.007748 0.055956 0.060427 0.589498 0.513736 0.282769 0.059060 0.074879
ordercanncelednum 0.043761 0.072213 0.241964 0.007129 0.697471 1.000000 0.325672 -0.069852 -0.077737 0.011840 -0.015206 -0.003672 0.039229 0.043845 0.352043 0.323278 0.181734 0.014271 0.033386
ordercanceledprecent 0.006269 -0.069449 0.018306 -0.033402 0.082158 0.325672 1.000000 -0.000853 0.015938 -0.003579 0.012170 -0.000564 -0.073073 -0.072026 -0.019029 -0.016109 -0.134614 -0.064174 -0.069562
lastpvgap -0.041993 -0.058683 -0.116780 -0.008366 -0.119503 -0.069852 -0.000853 1.000000 0.155794 -0.054367 -0.005508 0.017968 -0.017904 -0.022239 -0.095766 -0.094942 -0.026634 -0.043740 -0.045533
lasthtlordergap -0.036863 -0.052847 -0.003972 0.017887 -0.257930 -0.077737 0.015938 0.155794 1.000000 -0.015678 -0.017532 0.013465 0.081158 0.077568 -0.140166 -0.096864 -0.278271 -0.000045 0.030867
landhalfhours 0.108321 0.007570 0.039231 -0.000761 0.024762 0.011840 -0.003579 -0.054367 -0.015678 1.000000 0.233447 0.001964 0.001920 0.010923 0.007725 0.006446 -0.011540 0.002310 0.003704
iforderpv_24h 0.073917 -0.025380 -0.036166 -0.022761 -0.014624 -0.015206 0.012170 -0.005508 -0.017532 0.233447 1.000000 -0.012849 -0.026489 -0.022466 -0.029591 -0.025420 -0.000848 -0.029280 -0.040083
h -0.013557 -0.002809 -0.001223 -0.000671 -0.007748 -0.003672 -0.000564 0.017968 0.013465 0.001964 -0.012849 1.000000 0.007739 0.006350 -0.004408 0.000656 0.013188 0.001819 0.003963
delta_price2 0.017452 0.401197 0.132286 0.224172 0.055956 0.039229 -0.073073 -0.017904 0.081158 0.001920 -0.026489 0.007739 1.000000 0.909763 0.337208 0.322430 0.043167 0.599844 0.724924
delta_price1 0.019858 0.400740 0.137399 0.217116 0.060427 0.043845 -0.072026 -0.022239 0.077568 0.010923 -0.022466 0.006350 0.909763 1.000000 0.334857 0.320884 0.040365 0.589737 0.707538
customer_value_profit 0.060931 0.385153 0.282234 0.154500 0.589498 0.352043 -0.019029 -0.095766 -0.140166 0.007725 -0.029591 -0.004408 0.337208 0.334857 1.000000 0.846496 0.183976 0.459208 0.490475
ctrip_profits 0.065248 0.351759 0.286329 0.120446 0.513736 0.323278 -0.016109 -0.094942 -0.096864 0.006446 -0.025420 0.000656 0.322430 0.320884 0.846496 1.000000 0.159805 0.420644 0.456162
cr -0.029451 -0.007503 0.057134 0.056799 0.282769 0.181734 -0.134614 -0.026634 -0.278271 -0.011540 -0.000848 0.013188 0.043167 0.040365 0.183976 0.159805 1.000000 -0.060208 -0.023160
consuming_capacity 0.017015 0.728197 0.130227 0.241250 0.059060 0.014271 -0.064174 -0.043740 -0.000045 0.002310 -0.029280 0.001819 0.599844 0.589737 0.459208 0.420644 -0.060208 1.000000 0.899684
avgprice 0.026597 0.666243 0.165819 0.245152 0.074879 0.033386 -0.069562 -0.045533 0.030867 0.003704 -0.040083 0.003963 0.724924 0.707538 0.490475 0.456162 -0.023160 0.899684 1.000000

酒店信息特征相关性分析¶

酒店特征¶

In [ ]:
hotel_features=['hotelcr','hoteluv','commentnums','novoters','cancelrate','lowestprice','cr_pre','uv_pre','uv_pre2','businessrate_pre',
'businessrate_pre2','customereval_pre2','commentnums_pre','commentnums_pre2','cancelrate_pre','novoters_pre','novoters_pre2',
'deltaprice_pre2_t1','lowestprice_pre','lowestprice_pre2','historyvisit_visit_detailpagenum']

生成酒店特征的相关性矩阵¶

In [ ]:
# corr_mat1 = rawdf[hotel_features].corr()

def corr_hotel(rawdf):
    missing_columns = [col for col in hotel_features if col not in rawdf.columns]
    copy_user_features = hotel_features.copy()
    copy_user_features = [col for col in copy_user_features if col not in missing_columns]
    print(copy_user_features)
    mat = rawdf[copy_user_features].corr()
    # 绘制用户特征的相关性矩阵热度图
    fig, ax = plt.subplots(figsize=(18,12))
    sns.heatmap(mat, xticklabels=True, yticklabels=True, square=False, linewidths=.5, annot=True, cmap="Oranges")

    plt.show()

    return mat
train_data_rawdf_mat = corr_hotel(train_data_rawdf)
train_data_rawdf_mat
# test_data_rawdf_mat = corr_hotel(test_data_rawdf)
# test_data_rawdf_mat
# > 0.86
#   novoters和commentnums的相关性高达0.99,两个特征高度相关。因此取commentnums特征进入后续的预测与分析就好,或者选择PCA提取一个主成分表示酒店评论数
#   cencelrate和commentnums三者的相关性也很高达到了0.86,可以看出酒店的评论数和取消率有很高的关系,可能是由于用户选择酒店后会查看酒店的相关评价,酒店的评论信息越多,用户对酒店也越了解,因此退订数量越少。因此要鼓励用户对酒店进行评价。
#   uv_pre和uv_pre2的相关性达到了0.9,它们都表示24小时历史浏览次数最多的酒店的独立访客数信息,因此可以选择PCA提取一个主成分分析表示4小时历史浏览次数最多的酒店的独立访客数信息。
#   commentnums_pre和novoters_pre的相关性高达0.99,两个特征高度相关。因此选择PCA提取一个主成分表示24小时历史浏览次数最多酒店点评数。
#   commentnums_pre2和novoters_pre2的相关性高达0.99,两个特征高度相关。因此选择PCA提取一个主成分表示24小时历史浏览次数最多酒店点评数均值。
['hotelcr', 'hoteluv', 'commentnums', 'novoters', 'cancelrate', 'lowestprice', 'cr_pre', 'uv_pre', 'uv_pre2', 'businessrate_pre', 'businessrate_pre2', 'customereval_pre2', 'commentnums_pre', 'commentnums_pre2', 'cancelrate_pre', 'novoters_pre', 'novoters_pre2', 'deltaprice_pre2_t1', 'lowestprice_pre', 'lowestprice_pre2']
No description has been provided for this image
Out[ ]:
hotelcr hoteluv commentnums novoters cancelrate lowestprice cr_pre uv_pre uv_pre2 businessrate_pre businessrate_pre2 customereval_pre2 commentnums_pre commentnums_pre2 cancelrate_pre novoters_pre novoters_pre2 deltaprice_pre2_t1 lowestprice_pre lowestprice_pre2
hotelcr 1.000000 -0.195894 0.077973 0.111510 0.145374 -0.212134 0.455088 -0.193813 -0.221810 0.393174 0.462603 -0.018209 -0.042322 -0.057016 0.045860 -0.028255 -0.037198 0.025691 -0.149879 -0.184937
hoteluv -0.195894 1.000000 0.682082 0.678037 0.756565 0.372250 -0.201410 0.611926 0.693862 -0.206681 -0.208543 0.401654 0.400708 0.508152 -0.255675 0.397479 0.497864 -0.011403 0.310592 0.364769
commentnums 0.077973 0.682082 1.000000 0.986627 0.837859 0.224027 -0.048611 0.405871 0.454397 -0.017608 -0.005398 0.366761 0.415256 0.509763 -0.214569 0.416398 0.511192 0.028964 0.220525 0.254302
novoters 0.111510 0.678037 0.986627 1.000000 0.853580 0.219171 -0.033836 0.403685 0.451455 -0.004879 0.009604 0.369402 0.417601 0.512748 -0.217408 0.422311 0.518597 0.033301 0.218488 0.251498
cancelrate 0.145374 0.756565 0.837859 0.853580 1.000000 0.246753 -0.022484 0.428912 0.480678 0.030727 0.048090 0.388280 0.408483 0.511110 -0.202260 0.413909 0.515620 0.042032 0.239832 0.276399
lowestprice -0.212134 0.372250 0.224027 0.219171 0.246753 1.000000 -0.157166 0.293774 0.336927 -0.081715 -0.100605 0.418775 0.197605 0.255902 -0.176545 0.194376 0.251468 0.044972 0.482142 0.565141
cr_pre 0.455088 -0.201410 -0.048611 -0.033836 -0.022484 -0.157166 1.000000 -0.224403 -0.249726 0.521287 0.531262 -0.005069 0.051888 -0.017631 -0.067697 0.084736 0.013853 0.030638 -0.215729 -0.230065
uv_pre -0.193813 0.611926 0.405871 0.403685 0.428912 0.293774 -0.224403 1.000000 0.899233 -0.287193 -0.250915 0.475066 0.684716 0.673500 -0.320180 0.681359 0.666136 -0.011070 0.371156 0.403663
uv_pre2 -0.221810 0.693862 0.454397 0.451455 0.480678 0.336927 -0.249726 0.899233 1.000000 -0.288367 -0.282605 0.533003 0.600708 0.739316 -0.330733 0.596813 0.727811 -0.016392 0.391096 0.456633
businessrate_pre 0.393174 -0.206681 -0.017608 -0.004879 0.030727 -0.081715 0.521287 -0.287193 -0.288367 1.000000 0.839675 -0.018557 -0.033132 -0.043829 0.113483 -0.017423 -0.022617 0.083459 -0.093702 -0.116378
businessrate_pre2 0.462603 -0.208543 -0.005398 0.009604 0.048090 -0.100605 0.531262 -0.250915 -0.282605 0.839675 1.000000 -0.019580 -0.018595 -0.025075 0.144381 -0.002446 -0.000377 0.090499 -0.112667 -0.142615
customereval_pre2 -0.018209 0.401654 0.366761 0.369402 0.388280 0.418775 -0.005069 0.475066 0.533003 -0.018557 -0.019580 1.000000 0.434567 0.542253 -0.442958 0.436519 0.545988 0.025923 0.516774 0.592722
commentnums_pre -0.042322 0.400708 0.415256 0.417601 0.408483 0.197605 0.051888 0.684716 0.600708 -0.033132 -0.018595 0.434567 1.000000 0.821527 -0.310531 0.986821 0.822742 0.038209 0.231631 0.261593
commentnums_pre2 -0.057016 0.508152 0.509763 0.512748 0.511110 0.255902 -0.017631 0.673500 0.739316 -0.043829 -0.025075 0.542253 0.821527 1.000000 -0.329677 0.815104 0.981899 0.045098 0.287523 0.332534
cancelrate_pre 0.045860 -0.255675 -0.214569 -0.217408 -0.202260 -0.176545 -0.067697 -0.320180 -0.330733 0.113483 0.144381 -0.442958 -0.310531 -0.329677 1.000000 -0.322329 -0.340087 -0.009206 -0.220220 -0.237741
novoters_pre -0.028255 0.397479 0.416398 0.422311 0.413909 0.194376 0.084736 0.681359 0.596813 -0.017423 -0.002446 0.436519 0.986821 0.815104 -0.322329 1.000000 0.833242 0.043769 0.226831 0.256747
novoters_pre2 -0.037198 0.497864 0.511192 0.518597 0.515620 0.251468 0.013853 0.666136 0.727811 -0.022617 -0.000377 0.545988 0.822742 0.981899 -0.340087 0.833242 1.000000 0.055052 0.282702 0.325686
deltaprice_pre2_t1 0.025691 -0.011403 0.028964 0.033301 0.042032 0.044972 0.030638 -0.011070 -0.016392 0.083459 0.090499 0.025923 0.038209 0.045098 -0.009206 0.043769 0.055052 1.000000 0.071691 0.080952
lowestprice_pre -0.149879 0.310592 0.220525 0.218488 0.239832 0.482142 -0.215729 0.371156 0.391096 -0.093702 -0.112667 0.516774 0.231631 0.287523 -0.220220 0.226831 0.282702 0.071691 1.000000 0.847433
lowestprice_pre2 -0.184937 0.364769 0.254302 0.251498 0.276399 0.565141 -0.230065 0.403663 0.456633 -0.116378 -0.142615 0.592722 0.261593 0.332534 -0.237741 0.256747 0.325686 0.080952 0.847433 1.000000

订单字段相关性分析¶

In [ ]:
order_features = [ 'day_advanced', 'arrival_weekday', 'is_arrival_weekend' ,'ordercanceledprecent' ,'ordercanncelednum', 
'lasthtlordergap', 'cityuvs', 'cityorders']

绘制订单特征的相关性矩阵热度图¶

In [ ]:
def order_hotel(rawdf):
    missing_columns = [col for col in order_features if col not in rawdf.columns]
    copy_user_features = order_features.copy()
    copy_user_features = [col for col in copy_user_features if col not in missing_columns]
    print(copy_user_features)
    mat = rawdf[copy_user_features].corr()
    # 绘制用户特征的相关性矩阵热度图
    fig, ax = plt.subplots(figsize=(18,12))
    sns.heatmap(mat, xticklabels=True, yticklabels=True, square=False, linewidths=.5, annot=True, cmap="Blues")

    plt.show()

    return mat
train_data_rawdf_mat = order_hotel(train_data_rawdf)
train_data_rawdf_mat
# test_data_rawdf_mat = order_hotel(test_data_rawdf)
# test_data_rawdf_mat
# 看出cityorders和cityuvs存在0.99的相关性,需要
# 
# 
# 降维
['day_advanced', 'arrival_weekday', 'is_arrival_weekend', 'ordercanceledprecent', 'ordercanncelednum', 'lasthtlordergap', 'cityuvs', 'cityorders']
No description has been provided for this image
Out[ ]:
day_advanced arrival_weekday is_arrival_weekend ordercanceledprecent ordercanncelednum lasthtlordergap cityuvs cityorders
day_advanced 1.000000 -0.003262 -0.040864 0.002098 -0.067534 0.077942 -0.298827 -0.297744
arrival_weekday -0.003262 1.000000 0.748189 0.005056 -0.021776 0.010573 0.013699 -0.006654
is_arrival_weekend -0.040864 0.748189 1.000000 -0.001970 -0.018639 0.003588 -0.037376 -0.050069
ordercanceledprecent 0.002098 0.005056 -0.001970 1.000000 0.325672 0.015938 -0.001178 -0.000362
ordercanncelednum -0.067534 -0.021776 -0.018639 0.325672 1.000000 -0.077737 0.023551 0.027659
lasthtlordergap 0.077942 0.010573 0.003588 0.015938 -0.077737 1.000000 -0.022230 -0.024010
cityuvs -0.298827 0.013699 -0.037376 -0.001178 0.023551 -0.022230 1.000000 0.987370
cityorders -0.297744 -0.006654 -0.050069 -0.000362 0.027659 -0.024010 0.987370 1.000000

分别筛选用户和酒店很相关的维度进行降维度¶

In [ ]:
# 降维是指在某些限定条件下,降低随机变量(特征)个数,得到一组“不相关”主变量的过程。
# 这里使用主成分分析(PCA) 对相关度大于0.8的变量进行降维。
# 由上图的相关性分析矩阵,我们分别筛选用户和酒店很相关的维度进行降维度。

c_value=['customer_value_profit','ctrip_profits']                   # 用户价值维度
consume_level=['avgprice','consuming_capacity']                     # 用户消费水平
price_prefer=['delta_price1','delta_price2']                        # 用户偏好价格
ordernum_1_year = ['ordernum_oneyear', ]                            # 用户一年历史订单数
hotel_hot=['commentnums','novoters']                                # 酒店热度
hotel_hot_pre=['commentnums_pre','novoters_pre']                    # 24h内浏览次数最多的酒店热度
hotel_hot_pre2=['commentnums_pre2','novoters_pre2']                 # 24h内酒浏览酒店的平均热度
hotel_uv_pre = ['uv_pre', 'uv_pre2']                                # 24小时历史浏览次数最多的酒店的独立访客数
order_cityuvs_orders = ['cityorders','cityuvs']                     # 昨日访问当前城市同入住日期的UV数和订单
In [ ]:
from sklearn.decomposition import PCA
pca=PCA(n_components=1)
def pca_data(rawdf):
    missing_columns = [col for col in c_value if col  in rawdf.columns]

    if len(missing_columns)==0:
        return
    print('PCA降维前维度是:{}'.format(rawdf.shape))  # (684128, 40)
    rawdf['c_value']=pca.fit_transform(rawdf[c_value])
    rawdf['consume_level']=pca.fit_transform(rawdf[consume_level])
    rawdf['price_prefer']=pca.fit_transform(rawdf[price_prefer])
    rawdf['ordernum_1_year'] = pca.fit_transform(rawdf[ordernum_1_year])
    rawdf['hotel_hot']=pca.fit_transform(rawdf[hotel_hot])
    rawdf['hotel_hot_pre']=pca.fit_transform(rawdf[hotel_hot_pre])
    rawdf['hotel_hot_pre2']=pca.fit_transform(rawdf[hotel_hot_pre2])
    rawdf['hotel_uv_pre']=pca.fit_transform(rawdf[hotel_uv_pre])
    rawdf['order_cityuvs_orders']=pca.fit_transform(rawdf[order_cityuvs_orders])
    rawdf.drop(c_value,axis=1,inplace=True)
    rawdf.drop(consume_level,axis=1,inplace=True)
    rawdf.drop(price_prefer,axis=1,inplace=True)
    rawdf.drop(ordernum_1_year,axis=1,inplace=True)
    rawdf.drop(hotel_hot,axis=1,inplace=True)
    rawdf.drop(hotel_hot_pre,axis=1,inplace=True)
    rawdf.drop(hotel_hot_pre2,axis=1,inplace=True)
    rawdf.drop(hotel_uv_pre,axis=1,inplace=True)
    rawdf.drop(order_cityuvs_orders,axis=1,inplace=True)
    print('PCA降维后维度是:{}'.format(rawdf.shape))  # (684128, 40)

pca_data(train_data_rawdf)
pca_data(test_data_rawdf)
PCA降维前维度是:(689945, 46)
PCA降维后维度是:(689945, 38)
PCA降维前维度是:(435075, 45)
PCA降维后维度是:(435075, 37)
In [ ]:
isSame(train_data_rawdf,test_data_rawdf)
字段不完全一样。
在train_data中独有的字段: Index(['label'], dtype='object')
在test_data中独有的字段: Index([], dtype='object')
In [ ]:
train_data_rawdf.head(1)
train_data_rawdf_copy = train_data_rawdf.copy()

标准化处理¶

In [ ]:
from sklearn.preprocessing import StandardScaler
y=train_data_rawdf['label']
x = train_data_rawdf.drop('label', axis=1)
# 标准化
scaler = StandardScaler()
scaler.fit(x)
X = scaler.transform(x)
# 使用 fit 方法计算了数据的均值和标准差,并使用 transform 方法对数据进行了标准化
# X_test_x = test_data_rawdf

三、建模¶

导入模型建模¶

In [ ]:
from sklearn.model_selection import train_test_split, GridSearchCV
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size= 0.2,random_state=80471)

逻辑回归¶

In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score
from sklearn import metrics
# clf = LogisticRegression()

# # 参数网格
# param_grid = {
#     'C': [0.001, 0.01, 0.1, 1, 10, 100],
#     'penalty': ['l1', 'l2'],
#     'solver': ['liblinear', 'saga']
# }

# # GridSearchCV 或 RandomizedSearchCV
# lr = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')

lr = LogisticRegression()

lr.fit(X_train, y_train)
y_prob = lr.predict_proba(X_test)[:, 1]  # 预测1类的概率
y_pred = lr.predict(X_test)  # 模型对测试集的预测结果
# print(y_prob)
# print(y_pred)
fpr_lr, tpr_lr, threshold_lr = metrics.roc_curve(y_test, y_prob)  # # 获取真阳率、伪阳率、阈值

print(f"fpr_lr: {fpr_lr}")
print(f"tpr_lr: {tpr_lr}")
auc_lr = metrics.auc(fpr_lr, tpr_lr)
score_lr = metrics.accuracy_score(y_test, y_pred)
# 计算精确度和召回率
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)


train_prob = lr.score(X_train, y_train)
test_prob = lr.score(X_test, y_test)





print(f"Training set score: {(1 - train_prob) * 1000}")
print(f"train_prob: {train_prob}")

print(f"Precision: {precision}")
print(f"Recall: {recall}")

print('模型准确率为:{0}, AUC得分为:{1}'.format(score_lr, auc_lr))
print('============================================================')
print(classification_report(y_test, y_pred))
fpr_lr: [0.         0.         0.         ... 0.99918155 0.99918155 1.        ]
tpr_lr: [0.00000000e+00 2.64550265e-05 7.93650794e-05 ... 9.99973545e-01
 1.00000000e+00 1.00000000e+00]
Training set score: 262.23285914094595
train_prob: 0.737767140859054
Precision: 0.5792074592074592
Recall: 0.16433862433862434
模型准确率为:0.7383776967729312, AUC得分为:0.7013352216691405
============================================================
              precision    recall  f1-score   support

         0.0       0.75      0.95      0.84    100189
         1.0       0.58      0.16      0.26     37800

    accuracy                           0.74    137989
   macro avg       0.67      0.56      0.55    137989
weighted avg       0.70      0.74      0.68    137989

决策树¶

In [ ]:
from sklearn import tree


# 定义参数网格
param_grid = {
    'max_depth': [3, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random']
}

# 初始化决策树模型
grid_search = tree.DecisionTreeClassifier(random_state=42)

# 初始化 GridSearchCV 对象
dtc = GridSearchCV(estimator=grid_search, param_grid=param_grid, cv=5, scoring='accuracy')


# dtc = tree.DecisionTreeClassifier()                              # 建立决策树模型
dtc.fit(X_train,y_train)          
# 获取最佳模型
best_dtc = dtc.best_estimator_

                               # 训练模型
y_prob = best_dtc.predict_proba(X_test)[:,1]                          # 预测1类的概率
y_pred = best_dtc.predict(X_test)                                     # 模型对测试集的预测结果 
fpr_dtc,tpr_dtc,threshod_dtc= metrics.roc_curve(y_test,y_prob)   # 获取真阳率、伪阳率、阈值
score_dtc = metrics.accuracy_score(y_test,y_pred)                
auc_dtc = metrics.auc(fpr_dtc,tpr_dtc) 
print('模型准确率为:{0},AUC得分为:{1}'.format(score_dtc,auc_dtc))
print('============================================================')
print(classification_report(y_test,y_pred,labels=None,target_names=None,sample_weight=None, digits=2))
模型准确率为:0.7304785164034814,AUC得分为:0.632274001475835
============================================================
              precision    recall  f1-score   support

         0.0       0.75      0.95      0.84    100189
         1.0       0.53      0.14      0.22     37800

    accuracy                           0.73    137989
   macro avg       0.64      0.55      0.53    137989
weighted avg       0.69      0.73      0.67    137989

xgboost¶

In [ ]:
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score
from sklearn import metrics

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# 定义参数网格
# param_grid = {
#     'max_depth': [6, 8],
#     'eta': [0.1, 0.2],
#     'subsample': [0.6, 0.7],
#     'colsample_bytree': [0.7, 0.8],
#     'min_child_weight': [1, 3, 5],
#     'gamma': [0, 0.1, 0.2]
# }

param_grid = {'colsample_bytree': [0.8], 'eta': [0.2], 'gamma': [0], 'max_depth': [8], 'min_child_weight': [1], 'subsample': [0.7]}

# 初始化 XGBoost 模型
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='auc', nthread=8, silent=1)

# 初始化 GridSearchCV 对象
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='roc_auc')

# 训练模型
grid_search.fit(X_train, y_train)

# 获取最佳参数
print("Best parameters:", grid_search.best_params_)

# 获取最佳模型
best_xgb_model = grid_search.best_estimator_

# 使用最佳模型进行预测
y_prob_best = best_xgb_model.predict_proba(X_test)[:, 1]
y_pred_best = (y_prob_best >= 0.5).astype(int)

# 计算最佳模型的性能指标
fpr_best_xgb, tpr_best_xgb, _ = metrics.roc_curve(y_test, y_prob_best)
auc_best_xgb = metrics.auc(fpr_best_xgb, tpr_best_xgb)
precision_best = precision_score(y_test, y_pred_best)
recall_best = recall_score(y_test, y_pred_best)
accuracy_best_xgb = metrics.accuracy_score(y_test, y_pred_best)

print('模型准确率为:{0}, AUC得分为:{1}'.format(accuracy_best_xgb, auc_best_xgb))
print('============================================================')
print(classification_report(y_test, y_pred_best))



# y_prob_true_data = best_xgb_model.predict_proba(test_data_1)
# y_pred_true_data = (y_prob_true_data >= 0.5).astype(int)

# test_data_rawdf['Predicted'] = y_pred_true_data

# # 保存到 CSV 文件
# test_data_rawdf.to_csv('true_data_predictions.csv', index=False)


# # 读取训练集和测试集
# dtrain = xgb.DMatrix(X_train, y_train)
# dtest = xgb.DMatrix(X_test)
# # 设置xgboost建模参数
# params={ 
# 'booster':'gbtree','objective': 'binary:logistic','eval_metric': 'auc',
# 'max_depth':8,'gamma':0,'lambda':2,'subsample':0.7,'colsample_bytree':0.8,
# 'min_child_weight':3,'eta': 0.2,'nthread':8,'silent':1}
# # 训练模型
# watchlist = [(dtrain,'train')]
# bst=xgb.train(params,dtrain,num_boost_round=100,evals=watchlist)
# # 输入预测为正确的概率
# y_prob = bst.predict(dtest)
# # 设置阈值为0.5,得到测试集的测试结果
# y_pred = (y_pred >= 0.5)*1
# # 获取真阳率、伪阳率、阈值
# fpr_xgb,tpr_xgb,threshold_xgb = metrics.roc_curve(y_test,y_prob)   
# auc_xgb = metrics.auc(fpr_xgb,tpr_xgb)    # AUC得分
# # 计算精确度和召回率
# precision = precision_score(y_test, y_pred)
# recall = recall_score(y_test, y_pred)

# # print(f"Precision: {precision}")
# # print(f"Recall: {recall}")
# score_xgb = metrics.accuracy_score(y_test,y_pred)    # 模型准确率

# print('模型准确率为:{0},AUC得分为:{1}'.format(score_rfc,auc_rfc))
# print('============================================================')
# print(classification_report(y_test,y_pred,labels=None,target_names=None,sample_weight=None, digits=2))


# print(f"Precision: {precision}")
# print(f"Recall: {recall}")
Best parameters: {'colsample_bytree': 0.8, 'eta': 0.2, 'gamma': 0, 'max_depth': 8, 'min_child_weight': 1, 'subsample': 0.7}
模型准确率为:0.818789903543036, AUC得分为:0.8605628164356667
============================================================
              precision    recall  f1-score   support

         0.0       0.82      0.96      0.88    100189
         1.0       0.80      0.46      0.58     37800

    accuracy                           0.82    137989
   macro avg       0.81      0.71      0.73    137989
weighted avg       0.82      0.82      0.80    137989

XGBoost¶

阈值调整¶

画对比结果¶

In [ ]:
plt.style.use('bmh')
plt.figure(figsize=(10,10))
plt.plot(fpr_lr, tpr_lr, label='逻辑回归:%.3f' % score_lr )  # 逻辑回归
# plt.plot(fpr_gnb,tpr_gnb,label='gnb:{0:.3f}'.format(score_gnb))         # 朴素贝叶斯
# plt.plot(fpr_svc,tpr_svc,label='svc:{0:.3f}'.format(score_svc))         # 支持向量机
plt.plot(fpr_dtc,tpr_dtc,label='决策树:{0:.3f}'.format(score_dtc))         # 决策树
# plt.plot(fpr_rfc,tpr_rfc,label='rfc:{0:.3f}'.format(score_rfc))         # 随机森林
plt.plot(fpr_best_xgb, tpr_best_xgb,label='XGBoost:{0:.3f}'.format(accuracy_best_xgb))         # XGBoost
plt.plot([0, 1], [0, 1], 'k--', label='')
plt.legend(loc='lower right', prop={'size':15})
plt.xlabel('伪阳率')
plt.ylabel('真阳率')
plt.title('ROC曲线')
# plt.savefig('./images/模型比较ROC曲线图.jpg',dpi=400, bbox_inches='tight')
plt.show()
No description has been provided for this image

四、画像¶

RFM¶

In [ ]:
# RFM模型,即为:

# R(Rencency):最近一次消费
# F(Frequency):消费频率
# M(Monetary):消费金额
# 由于本数据集并没有直接给出这三个指标,经过分析,
# 选择选择lasthtlordergap(距离上次下单时长)、
# 和经过PCA降维处理的ordernum_1_year(用户年订单数)、consume_level(消费能力水平)分别作为R、F、M值,从而对我们的用户群体进行分群

rfm_features = ['lasthtlordergap','ordernum_1_year','consume_level']
rfm = train_data_rawdf[rfm_features]

# # 归一化(用于给RFM值打分)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(rfm)
rfm = pd.DataFrame(scaler.transform(rfm), columns=['recency', 'frequency','monetary'] )

# 分箱
rfm['R'] = pd.qcut(rfm['recency'], 2)
rfm['F'] = pd.qcut(rfm['frequency'], 2)
rfm['M'] = pd.qcut(rfm['monetary'], 2)

# # 根据分箱情况进行编码,二分类可以直接用标签编码方式
from sklearn.preprocessing import LabelEncoder
rfm['R'] = LabelEncoder().fit(rfm['R']).transform(rfm['R'])
rfm['F'] = LabelEncoder().fit(rfm['F']).transform(rfm['F'])
rfm['M'] = LabelEncoder().fit(rfm['M']).transform(rfm['M'])

#定义RFM模型,需要特别注意的是,R值代表距离上次消费时间间隔,值越小客户价值越高,与F和M值正好相反。
def get_label(r,f,m):
    if (r==0)&(f==1)&(m==1):
        return '高价值客户'
    if (r==1)&(f==1)&(m==1):
        return '重点保持客户'
    if((r==0)&(f==0)&(m==1)):
        return '重点发展客户'
    if (r==1)&(f==0)&(m==1):
        return '重点挽留客户'
    if (r==0)&(f==1)&(m==0):
        return '一般价值客户'
    if (r==1)&(f==1)&(m==0):
        return '一般保持客户'
    if (r==0)&(f==0)&(m==0):
        return '一般发展客户'
    if (r==1)&(f==0)&(m==0):
        return '潜在客户'
def RFM_convert(df):
    df['Label'] = df.apply(lambda x:get_label(x['R'], x['F'], x['M']), axis=1)
    df['R'] = np.where(df['R']==0, '高', '低')
    df['F'] = np.where(df['F']==1, '高', '低')
    df['M'] = np.where(df['M']==1, '高', '低')
    return df[['R','F','M','Label']]
rfm0 = RFM_convert(rfm)
rfm0.head()

# 可视化
# label_cnt = rfm0.groupby('Label').size()
label_cnt = rfm0['Label'].value_counts().values
labels = rfm0['Label'].value_counts().index
explode=[0.1,0.1,0.1,0,0,0,0,0]
plt.figure(figsize=(14,18))
# colors=['orangered','lightsalmon','sienna','seashell','chocolate','peru','sandybrown','peachpuff']
plt.pie(label_cnt, labels=labels,radius=1, explode=explode, autopct='%.1f%%',pctdistance=0.75,
wedgeprops={'linewidth':0.5,'edgecolor':'black'}, textprops={'fontsize':14,'color':'black'})
# plt.pie([1],radius=0.6,colors='w')
plt.title("RFM客户分群情况")
plt.legend(labels, fontsize=14, loc='best')
# plt.savefig('./images/客户分群情况.jpg',dpi=400, bbox_inches='tight')
plt.show()
No description has been provided for this image

K-means¶

选取出刻画用户的重要指标¶

In [ ]:
# # In [36]
# from xgboost import plot_importance
# # 解决f特征名字
# def ceate_feature_map(features):
#     outfile = open('xgb.fmap', 'w')
#     i = 0
#     for feat in features:
#         outfile.write('{0}\t{1}\tq\n'.format(i, feat))
#         i = i + 1
#     outfile.close()

# fig, ax = plt.subplots(figsize=(15,15))
# plot_importance(best_xgb_model, height=0.5, ax=ax, max_num_features=40, color='green')

# plt.savefig('./重要性特征图.jpg', dpi=400, bbox_inches='tight')
# ceate_feature_map(train_data_rawdf.columns)
# plt.show()

from xgboost import plot_importance
import matplotlib.pyplot as plt

# 修正函数名并解决特征名字映射问题
def create_feature_map(features):
    with open('xgb.fmap', 'w') as outfile:
        i = 0
        for feat in features:
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))
            i += 1

# 绘制特征重要性图
fig, ax = plt.subplots(figsize=(15, 15))
plot_importance(best_xgb_model, height=0.5, ax=ax, max_num_features=40, color='green')

# 保存特征重要性图为图片文件
plt.savefig('./重要性特征图.jpg', dpi=400, bbox_inches='tight')

# 创建特征映射文件
create_feature_map(train_data_rawdf.columns)

# 显示图表
plt.show()
No description has been provided for this image
In [ ]:
import pandas as pd

# 获取特征重要性
importances = best_xgb_model.feature_importances_

# 获取特征名称(假设 train_data_rawdf 是您的训练数据集)
feature_names = train_data_rawdf.columns.tolist()
len(importances) 
# 37
len(feature_names)
# 38

new_feature_names = feature_names.copy()
# new_feature_names
# 假设 new_feature_names 是您的特征名称列表


# 使用列表推导式删除 'label' 和 'sampleid'
new_feature_names = [feat for feat in new_feature_names if feat not in ['label']]
# , 'sampleid'

# 打印更新后的特征名称列表
print(new_feature_names)
len(new_feature_names)
len(importances)

# # 创建一个DataFrame来存储特征名称和它们的重要性得分
feature_importances = pd.DataFrame({'feature': new_feature_names, 'importance': importances})

# # 根据重要性得分对特征进行降序排序
feature_importances_sorted = feature_importances.sort_values(by='importance', ascending=False)

# # 获取最重要的15个特征
top_15_features = feature_importances_sorted.head(15)

# # 打印最重要的15个特征及其重要性得分
print(top_15_features)
['sampleid', 'iforderpv_24h', 'hotelcr', 'ordercanceledprecent', 'landhalfhours', 'ordercanncelednum', 'starprefer', 'cancelrate', 'price_sensitive', 'hoteluv', 'businessrate_pre', 'cr_pre', 'lowestprice', 'customereval_pre2', 'cancelrate_pre', 'deltaprice_pre2_t1', 'lowestprice_pre', 'lowestprice_pre2', 'lasthtlordergap', 'businessrate_pre2', 'lastpvgap', 'cr', 'sid', 'visitnum_oneyear', 'h', 'day_advanced', 'arrival_weekday', 'is_arrival_weekend', 'c_value', 'consume_level', 'price_prefer', 'ordernum_1_year', 'hotel_hot', 'hotel_hot_pre', 'hotel_hot_pre2', 'hotel_uv_pre', 'order_cityuvs_orders']
              feature  importance
1       iforderpv_24h    0.114401
25       day_advanced    0.078948
21                 cr    0.074080
24                  h    0.057563
2             hotelcr    0.042464
19  businessrate_pre2    0.036986
31    ordernum_1_year    0.033841
18    lasthtlordergap    0.029704
4       landhalfhours    0.029696
23   visitnum_oneyear    0.027455
22                sid    0.023670
28            c_value    0.023612
11             cr_pre    0.023009
5   ordercanncelednum    0.022008
20          lastpvgap    0.021730